This is an automated email from the git hooks/post-receive script. jforbes pushed a commit to branch master in repository kernel-tests. commit 7e2ed1c1e25b1ce470c12592e34aa271988434a2 Author: Dave Jones <davej@xxxxxxxxxx> Date: Thu Oct 4 14:43:52 2012 -0400 Add a performance target. Introduce lmbench as the first perf benchmark --- performance/lmbench3/ACKNOWLEDGEMENTS | 78 + performance/lmbench3/CHANGES | 82 + performance/lmbench3/COPYING | 339 ++ performance/lmbench3/COPYING-2 | 108 + performance/lmbench3/Makefile | 72 + performance/lmbench3/README | 23 + performance/lmbench3/doc/Makefile | 105 + performance/lmbench3/doc/bargraph.1 | 135 + performance/lmbench3/doc/benchmarks | 68 + performance/lmbench3/doc/bw_allmem.tbl | 61 + performance/lmbench3/doc/bw_file_rd.8 | 59 + performance/lmbench3/doc/bw_ipc.tbl | 53 + performance/lmbench3/doc/bw_mem.8 | 95 + performance/lmbench3/doc/bw_mem_rd.8 | 29 + performance/lmbench3/doc/bw_mmap_rd.8 | 46 + performance/lmbench3/doc/bw_pipe.8 | 59 + performance/lmbench3/doc/bw_reread2.tbl | 61 + performance/lmbench3/doc/bw_tcp.8 | 71 + performance/lmbench3/doc/bw_tcp.tbl | 57 + performance/lmbench3/doc/bw_unix.8 | 48 + performance/lmbench3/doc/cache.8 | 49 + performance/lmbench3/doc/ctx.pic | 198 + performance/lmbench3/doc/ctx.tbl | 63 + performance/lmbench3/doc/description.ms | 531 +++ performance/lmbench3/doc/graph.1 | 143 + performance/lmbench3/doc/lat_allmem.tbl | 62 + performance/lmbench3/doc/lat_allproc.tbl | 60 + performance/lmbench3/doc/lat_connect.8 | 47 + performance/lmbench3/doc/lat_connect.tbl | 44 + performance/lmbench3/doc/lat_ctx.8 | 95 + performance/lmbench3/doc/lat_disk.tbl | 23 + performance/lmbench3/doc/lat_fcntl.8 | 32 + performance/lmbench3/doc/lat_fifo.8 | 32 + performance/lmbench3/doc/lat_fs.8 | 37 + performance/lmbench3/doc/lat_fs.tbl | 56 + performance/lmbench3/doc/lat_http.8 | 41 + performance/lmbench3/doc/lat_ipc.tbl | 16 + performance/lmbench3/doc/lat_mem_rd.8 | 97 + performance/lmbench3/doc/lat_mmap.8 
| 45 + performance/lmbench3/doc/lat_nullsys.tbl | 58 + performance/lmbench3/doc/lat_ops.8 | 37 + performance/lmbench3/doc/lat_pagefault.8 | 46 + performance/lmbench3/doc/lat_pipe.8 | 38 + performance/lmbench3/doc/lat_pipe.tbl | 58 + performance/lmbench3/doc/lat_proc.8 | 58 + performance/lmbench3/doc/lat_rpc.8 | 68 + performance/lmbench3/doc/lat_select.8 | 33 + performance/lmbench3/doc/lat_sig.8 | 33 + performance/lmbench3/doc/lat_signal.tbl | 48 + performance/lmbench3/doc/lat_syscall.8 | 70 + performance/lmbench3/doc/lat_tcp.8 | 52 + performance/lmbench3/doc/lat_tcp.tbl | 59 + performance/lmbench3/doc/lat_udp.8 | 52 + performance/lmbench3/doc/lat_udp.tbl | 56 + performance/lmbench3/doc/lat_unix.8 | 41 + performance/lmbench3/doc/lat_unix_connect.8 | 43 + performance/lmbench3/doc/line.8 | 50 + performance/lmbench3/doc/lmbench.3 | 344 ++ performance/lmbench3/doc/lmbench.8 | 222 ++ performance/lmbench3/doc/lmbench3.ms | 1853 ++++++++++ performance/lmbench3/doc/lmbench3_arch.fig | 119 + performance/lmbench3/doc/lmbench3_signals.fig | 95 + performance/lmbench3/doc/lmdd.8 | 146 + performance/lmbench3/doc/mem.pic | 2337 ++++++++++++ performance/lmbench3/doc/memhier-color.d | 86 + performance/lmbench3/doc/memhier-line.d | 34 + performance/lmbench3/doc/memhier-tlb.d | 407 +++ performance/lmbench3/doc/memhier.ms | 1576 ++++++++ performance/lmbench3/doc/mhz.8 | 29 + performance/lmbench3/doc/par_mem.8 | 68 + performance/lmbench3/doc/par_ops.8 | 39 + performance/lmbench3/doc/parallel.ms | 385 ++ performance/lmbench3/doc/pgraph.1 | 155 + performance/lmbench3/doc/rccs.1 | 149 + performance/lmbench3/doc/refdbms.keys | 20 + performance/lmbench3/doc/references | 186 + performance/lmbench3/doc/references- | 175 + performance/lmbench3/doc/references-lmbench3 | 430 +++ performance/lmbench3/doc/references-memhier | 251 ++ performance/lmbench3/doc/references-parallel | 171 + performance/lmbench3/doc/references-userguide | 338 ++ performance/lmbench3/doc/references.private | 7 + 
performance/lmbench3/doc/reporting.3 | 71 + performance/lmbench3/doc/results.3 | 88 + performance/lmbench3/doc/stream.8 | 28 + performance/lmbench3/doc/timing.3 | 163 + performance/lmbench3/doc/tlb.8 | 55 + performance/lmbench3/doc/tmac.usenix | 1848 ++++++++++ performance/lmbench3/doc/usenix.ol | 102 + performance/lmbench3/doc/usenix96.ms | 1798 ++++++++++ performance/lmbench3/doc/userguide.ms | 3782 ++++++++++++++++++++ performance/lmbench3/hbench-REBUTTAL | 245 ++ performance/lmbench3/results/Makefile | 320 ++ performance/lmbench3/runtest.sh | 13 + performance/lmbench3/scripts/Makefile | 8 + performance/lmbench3/scripts/README | 7 + performance/lmbench3/scripts/SHIT | 724 ++++ performance/lmbench3/scripts/TODO | 3 + performance/lmbench3/scripts/allctx | 71 + performance/lmbench3/scripts/allmem | 69 + performance/lmbench3/scripts/bargraph | 430 +++ performance/lmbench3/scripts/bghtml | 39 + performance/lmbench3/scripts/build | 252 ++ performance/lmbench3/scripts/compiler | 16 + performance/lmbench3/scripts/config | 7 + performance/lmbench3/scripts/config-run | 783 ++++ performance/lmbench3/scripts/config-scaling | 160 + performance/lmbench3/scripts/depend | 28 + performance/lmbench3/scripts/do_ctx | 35 + performance/lmbench3/scripts/getbg | 806 +++++ performance/lmbench3/scripts/getbw | 260 ++ performance/lmbench3/scripts/getctx | 79 + performance/lmbench3/scripts/getdisk | 69 + performance/lmbench3/scripts/getlist | 31 + performance/lmbench3/scripts/getmax | 73 + performance/lmbench3/scripts/getmem | 69 + performance/lmbench3/scripts/getpercent | 400 +++ performance/lmbench3/scripts/getresults | 99 + performance/lmbench3/scripts/getsummary | 1089 ++++++ performance/lmbench3/scripts/gifs | 33 + performance/lmbench3/scripts/gnu-os | 1439 ++++++++ performance/lmbench3/scripts/graph | 947 +++++ performance/lmbench3/scripts/html-list | 123 + performance/lmbench3/scripts/html-man | 83 + performance/lmbench3/scripts/info | 7 + performance/lmbench3/scripts/info-template 
| 42 + performance/lmbench3/scripts/lmbench | 483 +++ performance/lmbench3/scripts/make | 20 + performance/lmbench3/scripts/man2html | 254 ++ performance/lmbench3/scripts/mkrelease | 23 + performance/lmbench3/scripts/new2oldctx | 31 + performance/lmbench3/scripts/opercent | 92 + performance/lmbench3/scripts/os | 20 + performance/lmbench3/scripts/output | 10 + performance/lmbench3/scripts/percent | 95 + performance/lmbench3/scripts/rccs | 733 ++++ performance/lmbench3/scripts/results | 39 + performance/lmbench3/scripts/save | 26 + performance/lmbench3/scripts/stats | 50 + performance/lmbench3/scripts/statsummary | 1075 ++++++ performance/lmbench3/scripts/synchronize | 60 + performance/lmbench3/scripts/target | 24 + performance/lmbench3/scripts/version | 25 + performance/lmbench3/scripts/xroff | 5 + performance/lmbench3/src/Makefile | 506 +++ performance/lmbench3/src/TODO | 107 + performance/lmbench3/src/bench.h | 323 ++ performance/lmbench3/src/bk.ver | 1 + performance/lmbench3/src/busy.c | 10 + performance/lmbench3/src/bw_file_rd.c | 192 + performance/lmbench3/src/bw_mem.c | 468 +++ performance/lmbench3/src/bw_mmap_rd.c | 185 + performance/lmbench3/src/bw_pipe.c | 187 + performance/lmbench3/src/bw_tcp.c | 251 ++ performance/lmbench3/src/bw_udp.c | 203 ++ performance/lmbench3/src/bw_unix.c | 190 + performance/lmbench3/src/cache.c | 750 ++++ performance/lmbench3/src/clock.c | 24 + performance/lmbench3/src/disk.c | 310 ++ performance/lmbench3/src/enough.c | 13 + performance/lmbench3/src/flushdisk.c | 42 + performance/lmbench3/src/getopt.c | 154 + performance/lmbench3/src/hello.c | 8 + performance/lmbench3/src/lat_cmd.c | 100 + performance/lmbench3/src/lat_connect.c | 110 + performance/lmbench3/src/lat_ctx.c | 350 ++ performance/lmbench3/src/lat_dram_page.c | 201 ++ performance/lmbench3/src/lat_fcntl.c | 224 ++ performance/lmbench3/src/lat_fifo.c | 165 + performance/lmbench3/src/lat_fs.c | 272 ++ performance/lmbench3/src/lat_http.c | 128 + 
performance/lmbench3/src/lat_mem_rd.c | 169 + performance/lmbench3/src/lat_mmap.c | 175 + performance/lmbench3/src/lat_ops.c | 485 +++ performance/lmbench3/src/lat_pagefault.c | 202 ++ performance/lmbench3/src/lat_pipe.c | 155 + performance/lmbench3/src/lat_pmake.c | 158 + performance/lmbench3/src/lat_proc.c | 182 + performance/lmbench3/src/lat_rand.c | 120 + performance/lmbench3/src/lat_rpc.c | 285 ++ performance/lmbench3/src/lat_select.c | 223 ++ performance/lmbench3/src/lat_sem.c | 162 + performance/lmbench3/src/lat_sig.c | 213 ++ performance/lmbench3/src/lat_syscall.c | 175 + performance/lmbench3/src/lat_tcp.c | 175 + performance/lmbench3/src/lat_udp.c | 207 ++ performance/lmbench3/src/lat_unix.c | 130 + performance/lmbench3/src/lat_unix_connect.c | 102 + performance/lmbench3/src/lat_usleep.c | 259 ++ performance/lmbench3/src/lib_debug.c | 131 + performance/lmbench3/src/lib_debug.h | 10 + performance/lmbench3/src/lib_mem.c | 699 ++++ performance/lmbench3/src/lib_mem.h | 60 + performance/lmbench3/src/lib_sched.c | 239 ++ performance/lmbench3/src/lib_stats.c | 603 ++++ performance/lmbench3/src/lib_tcp.c | 238 ++ performance/lmbench3/src/lib_tcp.h | 12 + performance/lmbench3/src/lib_timing.c | 1774 +++++++++ performance/lmbench3/src/lib_udp.c | 96 + performance/lmbench3/src/lib_udp.h | 12 + performance/lmbench3/src/lib_unix.c | 97 + performance/lmbench3/src/lib_unix.h | 8 + performance/lmbench3/src/line.c | 68 + performance/lmbench3/src/lmdd.1 | 131 + performance/lmbench3/src/lmdd.c | 893 +++++ performance/lmbench3/src/lmhttp.c | 397 ++ performance/lmbench3/src/loop_o.c | 8 + performance/lmbench3/src/memsize.c | 192 + performance/lmbench3/src/mhz.c | 507 +++ performance/lmbench3/src/msleep.c | 21 + performance/lmbench3/src/names.h | 102 + performance/lmbench3/src/par_mem.c | 81 + performance/lmbench3/src/par_ops.c | 501 +++ performance/lmbench3/src/rhttp.c | 125 + performance/lmbench3/src/seek.c | 65 + performance/lmbench3/src/stats.h | 61 + 
performance/lmbench3/src/stream.c | 309 ++ performance/lmbench3/src/timing.h | 52 + performance/lmbench3/src/timing_o.c | 10 + performance/lmbench3/src/tlb.c | 178 + performance/lmbench3/src/version.h | 2 + performance/lmbench3/src/webpage-lm.tar | Bin 0 -> 61440 bytes performance/lmbench3/src/webpage-lm/URLS | 14 + performance/lmbench3/src/webpage-lm/gifs/blueline | Bin 0 -> 596 bytes .../lmbench3/src/webpage-lm/gifs/cclip3.gif | Bin 0 -> 640 bytes performance/lmbench3/src/webpage-lm/gifs/eyes.gif | Bin 0 -> 125 bytes .../lmbench3/src/webpage-lm/gifs/eyesleft.gif | Bin 0 -> 125 bytes performance/lmbench3/src/webpage-lm/gifs/line1.gif | Bin 0 -> 270 bytes performance/lmbench3/src/webpage-lm/gifs/new.gif | Bin 0 -> 116 bytes .../lmbench3/src/webpage-lm/gifs/pookline.gif | Bin 0 -> 773 bytes .../lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif | Bin 0 -> 752 bytes .../lmbench3/src/webpage-lm/gifs/sgi_logo.gif | Bin 0 -> 4002 bytes .../lmbench3/src/webpage-lm/gifs/snow-bg2.jpg | Bin 0 -> 3830 bytes .../lmbench3/src/webpage-lm/gifs/spam-not.gif | Bin 0 -> 1322 bytes performance/lmbench3/src/webpage-lm/index.html | 253 ++ .../lmbench3/src/webpage-lm/pictures/me-small.jpg | Bin 0 -> 16292 bytes runtests.sh | 36 +- 237 files changed, 50723 insertions(+), 14 deletions(-) diff --git a/performance/lmbench3/ACKNOWLEDGEMENTS b/performance/lmbench3/ACKNOWLEDGEMENTS new file mode 100644 index 0000000..788a59f --- /dev/null +++ b/performance/lmbench3/ACKNOWLEDGEMENTS @@ -0,0 +1,78 @@ +LMbench was originally developed by Larry McVoy while he worked +at Sun Microsystems. Larry continued development while working +at Silicon Graphics, and was joined by Carl Staelin, who works +for Hewlett-Packard Laboratories. + +LMbench would not be the successful cross-platform benchmark +that it is today without the efforts and assistance of a wide +range of people. 
From volunteers who run it on various hardware +and report bugs, to managers who provide financial and other +support, to peers and colleagues who request features or +provide feedback on design elements. All such help has been +critical to making LMbench a success. + +Below is a partial list of all those people who helped support +the development of LMbench in one form or other, such as +benchmark suggestions, bug reports, and so forth. All omissions +are accidental, and if your name was not included, please accept +our humble apologies. + +The people who have helped LMbench include, in alphabetic +order: + +Ralf Baechle, +Christian Bau, +Nelson H. F. Beebe, +Anton Blanchard, +Joel Berman, +Paul Borrill, +Ed Bradford, +Len Brown, +Robert G. Brown, +Bruce Chapman, +Mark Culotta, +Fred Douglis, +Lars-Eke Eriksson, +Josh Fisher, +Marc Fleischmann, +John Fort, +Andy Glew, +Achim Gratz, +Richard Henderson, +Lev Iserovich, +Michael A. Julier, +Frans Kaashoek, +Brad Knowles, +Richard Littin, +Bil Long, +Udi Manber, +John Mashey, +David Miller, +Dejan Milojicic, +Ingo Molnar, +David Mosberger, +Satya Nishtala, +Kevin Normoyle, +Neal Nuckolls, +Steve Piatz, +Tim Prince, +James Riden, +Sam Roberts, +Philip Roth, +Chris Ruemmler, +Olli Savia, +Scott Schwartz, +Wayne Scott, +Stephan Somogyi, +Ratnakar Tiwari, +Linus Torvalds, +Dan Truong, +Dirk Twiehaus, +Duc Vianney, +Ramya Vijay, +Hai Vo-Ba, +David T. Wang, +Brian Whitney, +David Wilson, +Mitch Wright. + diff --git a/performance/lmbench3/CHANGES b/performance/lmbench3/CHANGES new file mode 100644 index 0000000..f1228a2 --- /dev/null +++ b/performance/lmbench3/CHANGES @@ -0,0 +1,82 @@ +lmbench3-alpha1 + Added new benchmark line, which determines the cache line size + + Added new benchmark tlb, which determines the effective TLB size. + Note that this may differ from the hardware TLB size due to OS + TLB entries and super-pages. 
+ + Added new benchmark par_mem, which determines the possible + speedup due to multiple memory reads progressing in parallel. + This number usually depends highly on the portion of the + memory hierarchy being probed, with higher caches generally + having greater parallelism. + + Added new benchmark cache, which determines the number of caches, + their sizes, latency, and available parallelism. It also + reports the latency and available parallelism for main memory. + + Added new benchmark lat_ops, which attempts to determine the + latency of basic operations, such as add, multiply and divide, + for a variety of data types, such as int, int64, float and + double. + + Added new benchmark par_ops, which attempts to determine the + available scaling of the various basic operations for various + data types. + + Added new benchmark stream, which reports memory bandwidth + numbers using benchmark kernels from John McCalpin's STREAM + and STREAM version 2 benchmarks. + + Added new benchmark lat_sem, which reports SysV semaphore latency. + + Added getopt() command line parsing to most benchmarks. + + Added a new benchmark timing harness, benchmp(), which makes + it relatively easy to design and build benchmarks which + measure system performance under a fixed load. It takes + a few parameters: + - initialize: a function pointer. If this is non-NULL + the function is called in the child processes after + the fork but before any benchmark-related work is + done. The function is passed a cookie from the + benchmp() call. This can be a pointer to a + data structure which lets the function know what + it needs to do. + - benchmark: a function pointer. This function + takes two parameters, an iteration count "iters", + and a cookie. The benchmarked activity must be + run "iters" times (or some integer multiple of + "iters". This function must be idempotent; ie., + the benchmark harness must be able to call it + as many times as necessary. + - cleanup: a function pointer. 
If this is non-NULL + the function is called after all benchmarking is + completed to cleanup any resources that may have + been allocated. + - enough: If this is non-zero then it is the minimum + amount of time, in micro-seconds, that the benchmark + must be run to provide reliable results. In most + cases this is left to zero to allow the harness to + autoscale the timing intervals to the system clock's + resolution/accuracy. + - parallel: this is the number of child processes + running the benchmark that should be run in parallel. + This is really the load factor. + - warmup: a time period in micro-seconds that each + child process must run the benchmarked process + before any timing intervals can begin. This is + to allow the system scheduler time to settle in + a parallel/distributed system before we begin + measurements. (If so desired) + - repetitions: If non-zero this is the number of + times we need to repeat each measurement. The + default is 11. + - cookie: An opaque value which can be used to + pass information to the initialize(), benchmark(), + and cleanup() routines. + This new harness is now used by: bw_file_rd, bw_mem, bw_mmap_rd, + bw_pipe, bw_tcp, bw_unix, lat_connect, lat_ctx, lat_fcntl, + lat_fifo, lat_mem_rd, lat_mmap, lat_ops, lat_pagefault, lat_pipe, + lat_proc, lat_rpc, lat_select, lat_sem, lat_sig, lat_syscall, + lat_tcp, lat_udp, lat_unix, lat_unix_connect, and stream. diff --git a/performance/lmbench3/COPYING b/performance/lmbench3/COPYING new file mode 100644 index 0000000..a43ea21 --- /dev/null +++ b/performance/lmbench3/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/performance/lmbench3/COPYING-2 b/performance/lmbench3/COPYING-2 new file mode 100644 index 0000000..3e1f7cc --- /dev/null +++ b/performance/lmbench3/COPYING-2 @@ -0,0 +1,108 @@ +%M% %I% %E% + +The set of programs and documentation known as "lmbench" are distributed +under the Free Software Foundation's General Public License with the +following additional restrictions (which override any conflicting +restrictions in the GPL): + +1. You may not distribute results in any public forum, in any publication, + or in any other way if you have modified the benchmarks. + +2. You may not distribute the results for a fee of any kind. This includes + web sites which generate revenue from advertising. + +If you have modifications or enhancements that you wish included in +future versions, please mail those to me, Larry McVoy, at lm@xxxxxxxxxxxx. 
+ +========================================================================= + +Rationale for the publication restrictions: + +In summary: + + a) LMbench is designed to measure enough of an OS that if you do well in + all categories, you've covered latency and bandwidth in networking, + disks, file systems, VM systems, and memory systems. + b) Multiple times in the past people have wanted to report partial results. + Without exception, they were doing so to show a skewed view of whatever + it was they were measuring (for example, one OS fit small processes into + segments and used the segment register to switch them, getting good + results, but did not want to report large process context switches + because those didn't look as good). + c) We insist that if you formally report LMbench results, you have to + report all of them and make the raw results file easily available. + Reporting all of them means in that same publication, a pointer + does not count. Formally, in this context, means in a paper, + on a web site, etc., but does not mean the exchange of results + between OS developers who are tuning a particular subsystem. + +We have a lot of history with benchmarking and feel strongly that there +is little to be gained and a lot to be lost if we allowed the results +to be published in isolation, without the complete story being told. + +There has been a lot of discussion about this, with people not liking this +restriction, more or less on the freedom principle as far as I can tell. +We're not swayed by that, our position is that we are doing the right +thing for the OS community and will stick to our guns on this one. + +It would be a different matter if there were 3 other competing +benchmarking systems out there that did what LMbench does and didn't have +the same reporting rules. There aren't and as long as that is the case, +I see no reason to change my mind and lots of reasons not to do so. 
I'm +sorry if I'm a pain in the ass on this topic, but I'm doing the right +thing for you and the sooner people realize that the sooner we can get on +to real work. + +Operating system design is largely an art of balancing tradeoffs. +In many cases improving one part of the system has negative effects +on other parts of the system. The art is choosing which parts to +optimize and which to not optimize. Just like in computer architecture, +you can optimize the common instructions (RISC) or the uncommon +instructions (CISC), but in either case there is usually a cost to +pay (in RISC uncommon instructions are more expensive than common +instructions, and in CISC common instructions are more expensive +than required). The art lies in knowing which operations are +important and optimizing those while minimizing the impact on the +rest of the system. + +Since lmbench gives a good overview of many important system features, +users may see the performance of the system as a whole, and can +see where tradeoffs may have been made. This is the driving force +behind the publication restriction: any idiot can optimize certain +subsystems while completely destroying overall system performance. +If said idiot publishes *only* the numbers relating to the optimized +subsystem, then the costs of the optimization are hidden and readers +will mistakenly believe that the optimization is a good idea. By +including the publication restriction readers would be able to +detect that the optimization improved the subsystem performance +while damaging the rest of the system performance and would be able +to make an informed decision as to the merits of the optimization. + +Note that these restrictions only apply to *publications*. We +intend and encourage lmbench's use during design, development, +and tweaking of systems and applications. 
If you are tuning the +linux or BSD TCP stack, then by all means, use the networking +benchmarks to evaluate the performance effects of various +modifications; Swap results with other developers; use the +networking numbers in isolation. The restrictions only kick +in when you go to *publish* the results. If you sped up the +TCP stack by a factor of 2 and want to publish a paper with the +various tweaks or algorithms used to accomplish this goal, then +you can publish the networking numbers to show the improvement. +However, the paper *must* also include the rest of the standard +lmbench numbers to show how your tweaks may (or may not) have +impacted the rest of the system. The full set of numbers may +be included in an appendix, but they *must* be included in the +paper. + +This helps protect the community from adopting flawed technologies +based on incomplete data. It also helps protect the community from +misleading marketing which tries to sell systems based on partial +(skewed) lmbench performance results. + +We have seen many cases in the past where partial or misleading +benchmark results have caused great harm to the community, and +we want to ensure that our benchmark is not used to perpetrate +further harm and support false or misleading claims. + + diff --git a/performance/lmbench3/Makefile b/performance/lmbench3/Makefile new file mode 100644 index 0000000..77671ff --- /dev/null +++ b/performance/lmbench3/Makefile @@ -0,0 +1,72 @@ +# Makefile for top level of lmbench +# $Id: Makefile 1.17 00/05/31 16:16:15+03:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ + +# Possible things to $(MAKE): +# +# build (default) go to the source directory and build the benchmark +# results go to the source directory and build and run the benchmark +# rerun run the benchmark again +# see see the results that came with this release +# Go to the results directory and read the Makefile. 
+# doc.lpr print the documentation +# doc.x preview the documentation (needs X, groff, pic, etc) +# clean go to the subdirs and $(MAKE) clean +# get $(MAKE) sure all files are checked out +# shar build a shippable shar archive + +SHELL=/bin/sh + +build: + cd src && $(MAKE) + +results: FRC + cd src && $(MAKE) results + +rerun: + cd src && $(MAKE) rerun + +see: + cd results && $(MAKE) summary percent 2>/dev/null | more + +doc.lpr: + cd doc && $(MAKE) PS && lpr *.PS + +doc.x: + cd doc && $(MAKE) x + +clobber clean: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && $(MAKE) clean); \ + done + /bin/rm -rf bin/* + +get: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && bk get -q); \ + done + @co -q + +info: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && info); \ + done + +release: scripts/mkrelease + scripts/mkrelease + +scripts/mkrelease: + cd scripts && co mkrelease + +# XXX - . must be named lmbench for this to work +shar: + $(MAKE) clean + co -q Makefile + $(MAKE) get + cd .. && \ + find lmbench -type f -print | egrep -v 'noship|RCS' > /tmp/FILES + cd .. && shar -S -a -n lmbench1.0 -L 50K < /tmp/FILES + +FRC: diff --git a/performance/lmbench3/README b/performance/lmbench3/README new file mode 100644 index 0000000..81a505d --- /dev/null +++ b/performance/lmbench3/README @@ -0,0 +1,23 @@ +README for lmbench 2alpha8 net release. + +To run the benchmark, you should be able to say: + + cd src + make results + +If you want to see how you did compared to the other system results +included here, say + + make see + +Be warned that many of these benchmarks are sensitive to other things +being run on the system, mainly from CPU cache and CPU cycle effects. +So make sure your screen saver is not running, etc. 
+ +It's a good idea to do several runs and compare the output like so + + make results + make rerun + make rerun + make rerun + cd Results && make LIST=<your OS>/* diff --git a/performance/lmbench3/doc/Makefile b/performance/lmbench3/doc/Makefile new file mode 100644 index 0000000..6fa93cb --- /dev/null +++ b/performance/lmbench3/doc/Makefile @@ -0,0 +1,105 @@ +# Makefile for lmbench doc subdir. +# $Id: Makefile 1.20 03/03/10 10:26:17+02:00 staelin@xxxxxxxxxxxxxxxxxxxxxx $ + +SHELL=/bin/sh +DESC = description.ms +USENIX = tmac.usenix usenix96.ms +PIC = ctx.pic mem.pic +SCRIPTS = ../scripts/ +BASE=/usr/local +MANDIR=${BASE}/man + +MAN = \ + bargraph.1 graph.1 \ + lmbench.3 reporting.3 results.3 timing.3 \ + lmbench.8 mhz.8 cache.8 line.8 tlb.8 lmdd.8 \ + lat_proc.8 lat_mmap.8 lat_ctx.8 lat_syscall.8 lat_pipe.8 \ + lat_http.8 lat_tcp.8 lat_udp.8 lat_rpc.8 lat_connect.8 lat_fs.8 \ + lat_ops.8 lat_pagefault.8 lat_mem_rd.8 lat_select.8 \ + lat_fifo.8 lat_fcntl.8 lat_sig.8 lat_unix.8 lat_unix_connect.8 \ + bw_file_rd.8 bw_mem.8 bw_mmap_rd.8 \ + bw_pipe.8 bw_tcp.8 bw_unix.8 \ + par_ops.8 par_mem.8 + +ALL = $(DESC) $(USENIX) $(PIC) $(MAN) $(REFER) references + +.SUFFIXES: .pic .fig + +.fig.pic: + fig2dev -L pic $< $*.pic + +PS ps: $(ALL) + gindxbib references + groff -t -e -G -s -p -R $(USENIX) > USENIX.PS + #groff -s -p -mgs $(DESC) > DESC.PS + #groff -fH -man $(MAN) > MAN.PS + +X x: $(ALL) + gindxbib references + $(SCRIPTS)xroff -t -e -s -p -R $(USENIX) + #$(SCRIPTS)xroff -s -p -mgs $(DESC) + #$(SCRIPTS)xroff -man -fH $(MAN) + +text: $(ALL) + gindxbib references + gsoelim usenix96.ms | sed "s/expand doublebox/center/" | \ + sed s/doublebox// > Fixed.ms + groff -Tascii -t -e -s -p -R -mgs Fixed.ms 2>/dev/null | colcrt - | more + +userguide.ps: $(ALL) references-userguide userguide.ms \ + lmbench3_arch.pic lmbench3_signals.pic ctx.tbl \ + bw_allmem.tbl bw_ipc.tbl bw_reread2.tbl bw_tcp.tbl \ + lat_allmem.tbl lat_allproc.tbl lat_connect.tbl \ + lat_disk.tbl lat_fs.tbl 
lat_ipc.tbl lat_nullsys.tbl \ + lat_pipe.tbl lat_signal.tbl lat_tcp.tbl lat_udp.tbl + gindxbib references-userguide + groff -t -e -G -s -p -R tmac.usenix userguide.ms > userguide.ps + +memhier.ps: $(ALL) memhier-color.d memhier-tlb.d memhier-line.d references-memhier memhier.ms + gindxbib references-memhier + groff -G -t -e -s -p -R tmac.usenix memhier.ms > memhier.ps +# ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-color.graph > memhier-color.pic +# ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-line.graph > memhier-line.pic +# ../scripts/graph -logx -small -below -nomarks -nospace memhier-tlb.graph > memhier-tlb.pic + +lmbench3.ps: $(ALL) references-lmbench3 lmbench3.ms \ + lmbench3_arch.pic lmbench3_signals.pic + gindxbib references-lmbench3 + groff -G -t -e -s -p -R tmac.usenix lmbench3.ms > lmbench3.ps + +parallel.ps: $(ALL) references-parallel parallel.ms + gindxbib references-parallel + groff -G -t -e -s -p -R tmac.usenix parallel.ms > parallel.ps + +install: $(MAN) + for f in $(MAN); do \ + for s in 1 2 3 4 5 6 7 8 9; do \ + if [ ! -d ${MANDIR}/man$${s} ]; then \ + mkdir -p ${MANDIR}/man$${s}; \ + fi; \ + base=`basename $${f} .$${s}`; \ + if [ "$${base}.$${s}" = "$$f" ]; then \ + cp $$f ${MANDIR}/man$${s}/; \ + fi; \ + done; \ + done + +get: $(ALL) + +edit: + get -e -s $(ALL) + +$(MAN): + get -s $(MAN) + +$(PIC): + get -s $(PIC) + +$(DESC): + get -s $(DESC) + +$(USENIX): + get -s $(USENIX) + +clean: + /bin/rm -f *.PS XXX bw.pic memrd_bcopy_comp.pic references.i diff --git a/performance/lmbench3/doc/bargraph.1 b/performance/lmbench3/doc/bargraph.1 new file mode 100644 index 0000000..226caa7 --- /dev/null +++ b/performance/lmbench3/doc/bargraph.1 @@ -0,0 +1,135 @@ +.\" $Id: bargraph.1 1.1 94/11/22 23:04:09-08:00 lm@xxxxxxxxxxxxxxx $ +.TH BARGRAPH 1 +.SH NAME +bargraph \- compile bar graphs into pic input +.SH SYNOPSIS +.B bargraph +[ +.I filename +\&.\|.\|. 
+] +.SH DESCRIPTION +.LP +.B bargraph +is a perl script which +takes a set of Y data with labels and generates a (human readable) pic script +that will produce the bar graph. +The output (pic input) is commented and is designed such that you should be +able to go in and adjust it to fit your document should you need to do so. +.LP +The input data format is: +.sp +.nf +.in +4 +3 foo bar +9 bigger foo +"Silly example +.in +.fi +.sp +with output like +.sp +.nf +.in +2 +.ft CW + bigger + foo + +----------+ + | | + foo | | + bar | | + +----------+ | | + | | | | + +----------+ +----------+ +------------------------------- + 3 9 + + Silly example +.ft +.in +.fi +.SH OPTIONS +The following command line options are available +.TP 10 +-big +Make the x/y defaults be 7.5 inches, crank up the title size, and don't +put a spacer at the top. Used for printing a graph on a full page. +.TP +-nobox +Do not put an outline box around the bargraph. +.SH "CONTROL OPTIONS" +The following may be included in the graph to control the format +of the graph. They must be at the beginning of a line and by themselves. +.TP 18 +%ps <ps> +point size. Default is 10. +.TP +%ft <ft> +font. Default is CB. +.TP +%labelgap <val> +the space in inches between fill labels. The bars may be filled with different +fill values (no patterns yet, pic doesn't do that). If you want to label +these, the labels are labelgap inches apart. Default is 1.5 inches. +.TP +%xsize <val> +the width of the graph in inches. Default is 7 inches. +.TP +%ysize <val> +the height of the graph in inches. Default is 6 inches. +.TP +%Title n|s <title> +the title of the bargraph. The title option is followed by a +a "n"orth (top) or "s"outh (bottom) indicator which controls placement +of the title. No default. +.TP +%titleplus <val> +increases the size of the title in pointsize. Default is 0. +.TP +%boxpercent <val> +a value between 0 and 100 that controls how closely the +bars are to each other. A value of 100 means the bars touch. 
+Default is 75. +.TP +%worse <D> <W> +An idiot arrow is drawn to indicate which way is worse. +<D> is the direction and must be "up" or "down". +<W> is the location specifier and must be one of +"n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. +.TP +%better <D> <W> +An idiot arrow is drawn to indicate which way is better. +<D> is the direction and must be "up" or "down". +<W> is the location specifier and must be one of +"n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. +.TP +%fakemax +pretend that one data point was this big when autoscaling. THis +is used to make a series of bargraphs be all drawn to the same +scale. +.SH "FILL CONTROL" +Each datum may be follwed by a fill specifier as follows +.sp .5 +.ti +.5i +3 foo bar %fill.5 +.sp .5 +Labels may be specified to group a set of data that all have +the same data. If a line appears like +.sp .5 +.ti +.5i +%label.5 The foo bar data +.sp .5 +then you get a label below the graph. +.SH "SEE ALSO" +.BR gtroff (1), +.BR graph (1), +.BR gpic (1) +.SH TODO +Make a -horizontal option that prints the graphs the other way. +.LP +Hack pick to get access to postscripts stipple patterns. +.SH BUGS +This isn't done. It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE thingys. I use it to manually generate +a pic file and then include that. I have to talk to James to +see if he wants it as part of the gpic stuff. 
diff --git a/performance/lmbench3/doc/benchmarks b/performance/lmbench3/doc/benchmarks new file mode 100644 index 0000000..d997811 --- /dev/null +++ b/performance/lmbench3/doc/benchmarks @@ -0,0 +1,68 @@ +Theme + Data movement and the cost thereof + Latency + Time per operation + CPU cycles per operation + Bandwidth + MB / sec + CPU cycles / MB + Media + Memory (load, bcopy) + Disk (randoms, sequentials) + File system (directory ops, sequential) + Network (hot potato, transfer) + Pipes (hot potato, transfer) + VM system (mmaps/munmaps, bcopy) + Systems + All Unix systems + Windows NT + VMS (?) + Mainframes (?) +Memory + Small transfers (randoms) + Load latency + Large transfers (sequential) + Bcopy bandwidth +Processes + Null process execution time + Context switching +Misc + Null entry into the system +Networking + Small transfers (randoms) + Transfers per second + CPU cycles per transfer + socket/bind/close per second + Large transfers (sequential) + MB per second + CPU cycles per MB +Disks + Small transfers (randoms) + Transfers per second + CPU cycles per transfer + Large transfers (sequential) + MB per second + CPU cycles per MB +File system + Small transfers (randoms) + Creates / second + Removes / second + Random I/O's per second in large file + CPU cycles per transfer + MB / sec when reading many related small files + Large files + MB / second read/write + CPU cycles per MB + Hardness + Measure fsck time? 
+Virtual memory system + Creation + mmaps per second + munmaps per second + Also vary size of mapped region + Small transfers (randoms) + Random reads per second of large mmaped file + CPU cycles per read + Large transfers (cached sequential) + MB per second read rate + CPU cycles per MB diff --git a/performance/lmbench3/doc/bw_allmem.tbl b/performance/lmbench3/doc/bw_allmem.tbl new file mode 100644 index 0000000..a0016c0 --- /dev/null +++ b/performance/lmbench3/doc/bw_allmem.tbl @@ -0,0 +1,61 @@ +.KS +.TS +expand doublebox; +c|c s|c s +l|c c|c c +l|r r|r r. + Bcopy Memory +System \fBunrolled\fP libc read write += +DEC Alpha 41 39 76 78\ +DEC Alpha 46 46 88 91\ +DEC Alpha 46 45 79 91\ +DEC Alpha 38 40 69 84\ +SunOS-5.4 sun4d 22 21 47 38\ +DEC Alpha 36 36 55 72\ +DEC Alpha 38 38 64 79\ +SunOS-5.4 sun4m 25 23 64 51\ +SunOS-5.4 sun4m 24 23 59 40\ +SunOS-5.4 sun4d 16 14 36 28\ +SunOS-5.4 sun4m 31 26 80 62\ +Sun SC1000 17 15 38 31\ +Sun Ultra1 85 167 129 152\ +Linux alpha 40 40 74 72\ +Linux i686 42 57 205 56\ +Linux i586 30 31 61 50\ +Linux alpha 39 39 73 71\ +Unixware/i686 65 55 214 86\ +Linux i586 38 42 74 75\ +IBM Power2 242 171 205 364\ +IBM PowerPC 21 21 63 26\ +dgux mc88110 17 17 37 19\ +DEC Alpha 15 15 46 20\ +IRIX64 IP21 68 70 92 90\ +IRIX64-601 IP26 41 32 65 61\ +Linux i586 38 41 74 75\ +Linux i586 20 21 60 31\ +Linux i586 20 21 58 30\ +Linux i586 20 21 60 31\ +Linux i486 16 17 33 41\ +HP-UX 9000/819 55 48 97 89\ +FreeBSD/i586 39 42 73 83\ +FreeBSD/i586 38 41 65 83\ +FreeBSD/i586 38 41 65 83\ +HP-UX 9000/735 32 26 55 52\ +HP-UX 9000/735 32 26 54 51\ +FreeBSD/i586 36 40 62 83\ +IRIX64 IP25 53 41 87 72\ +IRIX64 IP19 32 34 65 67\ +HP-UX 9000/735 31 26 53 51\ +HP-UX 9000/735 32 26 53 51\ +HP-UX 9000/755 31 25 49 52\ +HP-UX 9000/770 31 33 56 61\ +HP-UX 9000/897 19 19 40 37\ +IRIX64 IP19 35 36 65 67\ +IRIX IP19 33 34 67 72\ +IRIX5.3 IP19 32 34 65 68\ +IRIX IP22 32 33 68 72\ +IRIX5.3 IP22 31 32 69 66\ +FreeBSD/i586 39 42 65 83\ +.TE +.KE diff --git 
a/performance/lmbench3/doc/bw_file_rd.8 b/performance/lmbench3/doc/bw_file_rd.8 new file mode 100644 index 0000000..487e6f4 --- /dev/null +++ b/performance/lmbench3/doc/bw_file_rd.8 @@ -0,0 +1,59 @@ +.\" $Id: bw_file_rd.8 1.2 00/10/16 17:13:35+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_FILE_RD 8 "$Date: 00/10/16 17:13:35+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_file_rd \- time the reading and summing of a file +.SH SYNOPSIS +.B bw_file_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B bw_file_rd +times the read of the specified file in 64KB blocks. Each block is summed +up as a series of 4 byte integers in an unrolled loop. +Results are reported in megabytes read per second. +.LP +The data is not accessed in the user program; the benchmark relies on +the operating systems read interface to have actually moved the data. +Systems that implement page flipping may fool this benchmark. +.LP +The benchmark is intended to be used on a file +that is in memory, i.e., the benchmark is a reread benchmark. Other +file benchmarking can be done with +.BR lmdd (8). +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to three times the requested memory. Most Unix +systems implement the read system call as a bcopy from kernel space +to user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destination. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented ``load cache line'' +and ``store cache line'' instructions (as well as ``getcachelinesize''). 
+.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/bw_ipc.tbl b/performance/lmbench3/doc/bw_ipc.tbl new file mode 100644 index 0000000..b106e06 --- /dev/null +++ b/performance/lmbench3/doc/bw_ipc.tbl @@ -0,0 +1,53 @@ +.KS +.TS +expand doublebox; +l c c c +l r r r. +System bcopy \fBpipe\fP TCP += +DEC Alpha 36 32 9\ +DEC Alpha 46 54 11\ +DEC Alpha 38 23 7\ +DEC Alpha 45 35 9\ +DEC Alpha 39 32 12\ +Linux alpha 39 73 9\ +Sun Ultra1 167 61 51\ +SunOS-5.4 sun4m 26 11 11\ +SunOS-5.4 sun4m 23 24 19\ +DEC Alpha 40 24 6\ +DEC Alpha 15 17 4\ +Linux alpha 40 73 9\ +Linux i586 42 34 7\ +Linux i486 17 16 6\ +Linux i586 31 24 3\ +IBM Power2 171 84 10\ +IBM PowerPC 21 30 17\ +SunOS-5.4 sun4d 14 7 8\ +HP-UX 9000/735 26 37 24\ +SunOS-5.4 sun4m 23 7 9\ +Linux i686 57 73 15\ +Linux i586 41 22 5\ +Linux i586 21 19 3\ +Linux i586 21 18 3\ +Linux i586 21 12 3\ +Sun SC1000 15 9 11\ +SunOS-5.4 sun4d 21 8 9\ +IRIX5.3 IP22 32 34 22\ +IRIX64-601 IP26 32 37 22\ +HP-UX 9000/770 33 53 21\ +HP-UX 9000/819 48 37 28\ +HP-UX 9000/755 25 38 35\ +IRIX IP22 33 32 7\ +IRIX64 IP21 70 28 19\ +HP-UX 9000/735 26 44 20\ +HP-UX 9000/735 26 42 18\ +HP-UX 9000/735 26 39 19\ +IRIX64 IP25 41 40 26\ +IRIX64 IP19 34 27 19\ +IRIX64 IP19 36 17 31\ +IRIX IP19 34 14 16\ +IRIX5.3 IP19 34 12 12\ +HP-UX 9000/897 19 26 17\ +dgux mc88110 17 8 5\ +.TE +.KE diff --git a/performance/lmbench3/doc/bw_mem.8 b/performance/lmbench3/doc/bw_mem.8 new file mode 100644 index 0000000..50ed049 --- /dev/null +++ b/performance/lmbench3/doc/bw_mem.8 @@ -0,0 +1,95 @@ +.\" $Id: bw_mem.8 1.4 00/10/16 17:13:36+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_MEM 8 "$Date: 00/10/16 17:13:36+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +bw_mem \- time memory bandwidth +.SH SYNOPSIS 
+.B bw_mem_cp +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I rd|wr|rdwr|cp|fwr|frd|bzero|bcopy +.I [align] +.SH DESCRIPTION +.B bw_mem +allocates twice the specified amount of memory, zeros it, and then times +the copying of the first half to the second half. Results are reported +in megabytes moved per second. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.LP +There are nine different memory benchmarks in +.BR bw_mem . +They each measure slightly different methods for reading, writing or +copying data. +.TP +.B "rd" +measures the time to read data into the processor. It computes the +sum of an array of integer values. It accesses every fourth word. +.TP +.B "wr" +measures the time to write data to memory. It assigns a constant +value to each memory of an array of integer values. +It accesses every fourth word. +.TP +.B "rdwr" +measures the time to read data into memory and then write data to +the same memory location. For each element in an array it adds +the current value to a running sum before assigning a new (constant) +value to the element. +It accesses every fourth word. +.TP +.B "cp" +measures the time to copy data from one location to another. It +does an array copy: dest[i] = source[i]. +It accesses every fourth word. +.TP +.B "frd" +measures the time to read data into the processor. It computes the +sum of an array of integer values. +.TP +.B "fwr" +measures the time to write data to memory. It assigns a constant +value to each memory of an array of integer values. +.TP +.B "fcp" +measures the time to copy data from one location to another. It +does an array copy: dest[i] = source[i]. +.TP +.B "bzero" +measures how fast the system can +.I bzero +memory. 
+.TP +.B "bcopy" +measures how fast the system can +.I bcopy +data. +.SH MEMORY UTILIZATION +This benchmark can move up to three times the requested memory. +Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destionation. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented ``load cache line'' +and ``store cache line'' instructions (as well as ``getcachelinesize''). +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/bw_mem_rd.8 b/performance/lmbench3/doc/bw_mem_rd.8 new file mode 100644 index 0000000..11e5c48 --- /dev/null +++ b/performance/lmbench3/doc/bw_mem_rd.8 @@ -0,0 +1,29 @@ +.\" $Id: bw_mem_rd.8 1.1 94/11/18 01:26:35-08:00 lm@xxxxxxxxxxxxxxx $ +.TH BW_MEM_RD 8 "$Date: 94/11/18 01:26:35-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_mem_rd \- time memory read rate (with overhead) +.SH SYNOPSIS +.B bw_mem_rd +.I size +.SH DESCRIPTION +.B bw_mem_rd +allocates the specified amount of memory, zeros it, and then times the +reading of that memory as a series of integer loads and adds. Each +four byte integer is loaded and added to accumulator. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). 
diff --git a/performance/lmbench3/doc/bw_mmap_rd.8 b/performance/lmbench3/doc/bw_mmap_rd.8 new file mode 100644 index 0000000..1b666b9 --- /dev/null +++ b/performance/lmbench3/doc/bw_mmap_rd.8 @@ -0,0 +1,46 @@ +.\" $Id: bw_mmap_rd.8 1.2 00/10/16 17:13:37+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_MMAP_RD 8 "$Date: 00/10/16 17:13:37+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_mmap_rd \- time the reading and summing of a file +.SH SYNOPSIS +.B bw_mmap_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B bw_mmap_rd +creates a memory mapping to the file and then reads the mapping in an unrolled +loop similar to that used in bw_mem_rd(8). +The benchmark is intended to be used on a file +that is in memory, i.e., the benchmark is a reread benchmark. Other +file benchmarking can be done with +.BR lmdd (8). +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_pipe.8 b/performance/lmbench3/doc/bw_pipe.8 new file mode 100644 index 0000000..ea8fdec --- /dev/null +++ b/performance/lmbench3/doc/bw_pipe.8 @@ -0,0 +1,59 @@ +.\" $Id: bw_pipe.8 1.2 00/10/16 17:13:38+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_PIPE 8 "$Date: 00/10/16 17:13:38+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_pipe \- time data movement through pipes +.SH SYNOPSIS +.B bw_pipe +[ +.I "-m <message size>" +] +[ +.I "-M <total bytes>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B bw_pipe +creates a Unix pipe between two processes and moves +.I "total bytes" +through the pipe in +.I "message size" +chunks (note that pipes are typically sized smaller than that). +The default +.I "total bytes" +is 10MB and the default +.I "message size" +is 64KB. +.SH OUTPUT +Output format is \f(CB"Pipe bandwidth: %0.2f MB/sec\\n", megabytes_per_second\fP, i.e., +.sp +.ft CB +Pipe bandwidth: 4.87 MB/sec +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to six times the requested memory per process. +There are two processes, the sender and the receiver. +Most Unix +systems implement the read/write system calls as a bcopy from/to kernel space +to/from user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destionation. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented "load cache line" +and "store cache line" instructions (as well as getcachelinesize). +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_reread2.tbl b/performance/lmbench3/doc/bw_reread2.tbl new file mode 100644 index 0000000..d1e4347 --- /dev/null +++ b/performance/lmbench3/doc/bw_reread2.tbl @@ -0,0 +1,61 @@ +.KS +.TS +expand doublebox; +c|c c|c c +l|c c|c c +l|r r|r r. + Libc \fBFile\fP Memory File +System bcopy \fBread\fP read mmap += +DEC Alpha 38 37 64 12\ +DEC Alpha 45 40 79 50\ +DEC Alpha 36 36 55 19\ +DEC Alpha 40 44 69 14\ +DEC Alpha 46 48 88 26\ +DEC Alpha 39 39 76 23\ +SunOS-5.4 sun4m 23 31 59 31\ +SunOS-5.4 sun4m 26 23 80 30\ +SunOS-5.4 sun4d 14 23 36 25\ +SunOS-5.4 sun4d 21 23 47 17\ +Sun SC1000 15 20 38 28\ +DEC Alpha 15 20 46 14\ +Sun Ultra1 167 85 129 101\ +Linux alpha 40 25 74 23\ +Linux i586 31 17 61 14\ +SunOS-5.4 sun4m 23 21 64 39\ +Linux alpha 39 24 73 18\ +Unixware/i686 55 53 214 198\ +Linux i586 42 23 74 9\ +IBM Power2 171 187 205 106\ +IBM PowerPC 21 40 63 51\ +Linux i486 17 9 33 10\ +IRIX64 IP21 70 65 92 72\ +Linux i686 57 46 205 34\ +IRIX64-601 IP26 32 75 65 56\ +Linux i586 41 21 74 13\ +Linux i586 21 14 60 11\ +Linux i586 21 14 58 10\ +Linux i586 21 13 60 8\ +HP-UX 9000/735 26 47 55 36\ +HP-UX 9000/819 48 64 97 41\ +HP-UX 9000/755 25 45 49 32\ +FreeBSD/i586 42 38 65 49\ +FreeBSD/i586 42 30 73 54\ +FreeBSD/i586 41 29 65 46\ +IRIX64 IP19 34 34 65 56\ +FreeBSD/i586 40 28 62 47\ +IRIX64 IP25 41 60 87 76\ +HP-UX 9000/735 26 43 53 33\ +HP-UX 9000/735 26 43 54 34\ +HP-UX 9000/735 26 43 53 35\ +HP-UX 9000/770 33 43 56 37\ +HP-UX 9000/897 19 39 40 28\ +FreeBSD/i586 41 29 65 50\ +dgux mc88110 17 16 37 13\ +IRIX5.3 IP22 32 32 69 44\ +IRIX IP19 34 39 67 43\ +IRIX64 IP19 36 36 65 56\ +IRIX5.3 IP19 34 36 65 43\ +IRIX IP22 33 37 68 48\ +.TE +.KE diff --git a/performance/lmbench3/doc/bw_tcp.8 b/performance/lmbench3/doc/bw_tcp.8 new file mode 100644 index 0000000..b60d2fd --- /dev/null +++ b/performance/lmbench3/doc/bw_tcp.8 @@ -0,0 +1,71 @@ +.\" $Id: bw_tcp.8 1.3 00/10/16 17:13:39+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_TCP 1 "$Date: 
00/10/16 17:13:39+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_tcp \- time data movement through TCP/IP sockets +.SH SYNOPSIS +.B bw_tcp +[ +.I "-m <message size>" +] +[ +.I "-M <total bytes>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "server" +.br or +.B bw_tcp +.I -s +.br or +.B bw_tcp +.I "-S <server>" +.SH DESCRIPTION +.B bw_tcp +is a client/server program that moves data over a TCP/IP socket. Nothing is +done with the data on either side; +.I "total bytes" +of data is moved in +.I "message size" +chunks. +.LP +.B bw_tcp +has three forms of usage: as a server (-s), as a client (bw_tcp localhost), and +as a shutdown (bw_tcp -S localhost). +.LP +The default amount of data is 10MB. The client form may specify a different +amount of data. Specifications may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is +.ft CB +Socket bandwidth using localhost: 2.32 MB/sec +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to six times the requested memory per process +when run through the loopback device. +There are two processes, the sender and the receiver. +Most Unix +systems implement the read/write system calls as a bcopy from/to kernel space +to/from user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destination. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented "load cache line" +and "store cache line" instructions (as well as getcachelinesize). +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation +and Silicon Graphics, Inc. +.SH SEE ALSO +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_tcp.tbl b/performance/lmbench3/doc/bw_tcp.tbl new file mode 100644 index 0000000..6bb1851 --- /dev/null +++ b/performance/lmbench3/doc/bw_tcp.tbl @@ -0,0 +1,57 @@ +.KS +.TS +center expand doublebox; +l r. +Linux alpha 8.9 +Linux i486 5.5 +Linux alpha 8.8 +Linux i586 3.2 +Linux i486 5.6 +Linux i586 2.9 +DEC Alpha 11.2 +Linux i586 3.0 +SunOS-5.4 sun4m 9.5 +SunOS-5.4 sun4m 11.0 +DEC Alpha 4.1 +DEC Alpha 6.6 +DEC Alpha 12.1 +Linux i586 3.0 +SunOS-5.4 sun4d 7.9 +SunOS-5.4 sun4d 9.1 +DEC Alpha 8.6 +DEC Alpha 6.0 +DEC Alpha 10.5 +Sun SC1000 10.9 +Linux i586 5.1 +DEC Alpha 9.2 +Linux i586 6.8 +FreeBSD/i586 0.1 +IRIX IP22 7.2 +Linux i686 14.7 +FreeBSD/i586 0.1 +SunOS-5.4 sun4m 19.5 +FreeBSD/i586 0.1 +Sun Ultra1 51.3 +FreeBSD/i586 0.2 +FreeBSD/i586 0.2 +IBM Power2 10.5 +IBM PowerPC 16.6 +dgux mc88110 4.6 +IRIX64 IP21 18.8 +IRIX IP19 16.4 +HP-UX 9000/735 18.4 +HP-UX 9000/735 19.0 +HP-UX 9000/735 23.9 +HP-UX 9000/897 16.9 +IRIX64-601 IP26 21.5 +IRIX5.3 IP22 22.1 +IRIX5.3 IP19 12.2 +IRIX64 IP19 18.8 +IRIX64 IP25 26.1 +IRIX64 IP19 30.8 +HP-UX 9000/770 20.5 +HP-UX 9000/819 27.7 +HP-UX 9000/755 35.2 +HP-UX 9000/735 19.6 +.TE +.KE diff --git a/performance/lmbench3/doc/bw_unix.8 b/performance/lmbench3/doc/bw_unix.8 new file mode 100644 index 0000000..1940e78 --- /dev/null +++ b/performance/lmbench3/doc/bw_unix.8 @@ -0,0 +1,48 @@ +.\" $Id: bw_unix.8 1.4 00/10/16 17:13:40+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_UNIX 8 "$Date: 00/10/16 17:13:40+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +bw_unix \- UNIX pipe bandwidth +.SH SYNOPSIS +.B bw_unix +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.SH DESCRIPTION +.B bw_unix +creates a pipe and forks a child process which keeps writing +data to the pipe as fast as it can. The benchmark measures +how fast the parent process can +.I read +the data in +.IR size -byte +chunks from the pipe. 
Nothing is done with the data in either +the parent (reader) or child (writer) processes. +.LP +The +.I size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH "MEMORY UTILIZATION" +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/cache.8 b/performance/lmbench3/doc/cache.8 new file mode 100644 index 0000000..15bdeb8 --- /dev/null +++ b/performance/lmbench3/doc/cache.8 @@ -0,0 +1,49 @@ +.\" $Id$ +.TH CACHE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +cache \- cache parameters +.SH SYNOPSIS +.B cache +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B cache +tries to determine the characteristics of the memory hierarchy. It +attempts to determine the number of caches, the size of each cache, +the line size for each cache, and the available memory parallelism at +each level in the memory hierarchy. +The largest amount of memory it will examine is +.I len +bytes. +.LP +.B cache +first attempts to determine the number and size of caches by measuring +the memory latency for various memory sizes. Once it has identified +the various caches it then measures the latency, parallelism, and line +size for each cache. Unfortunately, determining the cache size merely +from latency is exceedingly difficult due to variations in cache +replacement and prefetching strategies. +.SH BUGS +.B cache +is an experimental benchmark and is known to fail on many processors. 
+In particular there are a large number of machines with weird caching +behavior that confuse +.B cache +and prevent it from accurately determining the number and size of the +various caches. +.SH "SEE ALSO" +lmbench(8), line(8), tlb(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/ctx.pic b/performance/lmbench3/doc/ctx.pic new file mode 100644 index 0000000..8e55781 --- /dev/null +++ b/performance/lmbench3/doc/ctx.pic @@ -0,0 +1,198 @@ +.sp .10i +.in +.07i +.PS +.ps 9 +.vs 9 +.ft CB +[ +# Variables, tweak these. + xtick = 2.000000 # width of an X tick + xlower = 0.000000 # where the xtick start + xupper = 22.000000 # upper range of graph + xn = 11 # number of ticks to do + ytick = 50.000000 # width of an Y tick + ylower = 0.000000 # where the ytick start + yupper = 450.000000 # upper range of graph + yn = 9 # number of ticks to do + xsize = 2.05 # width of the graph + ysize = 2.1 # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = 0.10000000000000001 # distance towards numbers + gthk = .1 # thickness of grid lines + thk = 0.75 # thickness of data lines + qthk = 2.0 # thickness of quartile lines + vs = .10 # works for 10 point fonts + +# Draw the graph borders and tick marks + O: box thick 1.5 ht ysize wid xsize + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) + + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) + } + if (yupper - ylower > 999) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) + } else { 
+ sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) + }}} + j = j + ytick + } + j = xlower + for i = 0 to xn by 1 do { + xs = j - xlower + g = xs * xscale + line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) + + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) + } + if (xupper - xlower > 999) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + (g, -.25) + } else { + sprintf("%.2f", j) at O.sw + (g, -.25) + }}} + j = j + xtick + } + +# DATASET: Process size=0 overhead=10, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (6 - ylower)) +[ "\(ci" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (7 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (7 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=4 overhead=19, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (7 - ylower)) +[ "\(sq" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (9 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (12 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=16 overhead=66, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (14 - ylower)) +[ 
"\(*D" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (15 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (18 - ylower)) +".12M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (18 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (46 - ylower)) +".25M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (46 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (88 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=32 overhead=129, MARK 3 +[ "\(mu" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (22 - ylower)) +[ "\(mu" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (24 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (107 - ylower)) +".25M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (107 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (187 - ylower)) +".5M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (187 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (188 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=64 overhead=255, MARK 4 +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (38 - ylower)) +".12M" at O.sw + \ + (xscale * (2 - xlower), .12 + yscale * (38 - ylower)) +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (140 - ylower)) +".25M" at O.sw + \ + (xscale * (4 - xlower) - .14, .12 + yscale * (140 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (363 - ylower)) +".5M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (363 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (367 - ylower)) +"1M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (367 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (367 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# Xaxis title. +"\s+1Processes\s0" rjust at O.se - (-.15, .6) + +# Yaxis title (Time in microseconds) +.ps +1 +"T" "i" "m" "e" " " "i" "n" at O.w - (.85, 0) +"m" "i" "c" "r" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.68, 0) +.ps + +# Graph title. +.vs 12 +"\s+2Context switches for" "Linux i686@167Mhz\s0" at O.n + (-.5, .4) +.vs + +# Title. +[ "\(ci" ] at O.sw - (.80, .50 + 0 * vs) +"size=0KB \ overhead=10" ljust at last [].e + (.1, 0) +[ "\(sq" ] at last [] - (0, vs) +"size=4KB \ overhead=19" ljust at last [].e + (.1, 0) +[ "\(*D" ] at last [] - (0, vs) +"size=16KB overhead=66" ljust at last [].e + (.1, 0) +[ "\(mu" ] at last [] - (0, vs) +"size=32KB overhead=129" ljust at last [].e + (.1, 0) +[ "\s+4\(bu\s0" ] at last [] - (0, vs) +"size=64KB overhead=255" ljust at last [].e + (.1, 0) +] +.ft +.ps +.in +.PE diff --git a/performance/lmbench3/doc/ctx.tbl b/performance/lmbench3/doc/ctx.tbl new file mode 100644 index 0000000..b3fdb1a --- /dev/null +++ b/performance/lmbench3/doc/ctx.tbl @@ -0,0 +1,63 @@ +.KS +.TS +expand doublebox; +c|c s|c s +l|c c|c c +l|r r|r r. 
+ 2 processes 8 processes +System \fB0KB\fP 32KB 0KB 32KB += +Linux alpha 10 17 13 41\ +Linux i486 11 394 18 594\ +Linux alpha 11 73 13 92\ +Linux i486 -1 70 -1 78\ +Linux i586 10 163 13 215\ +DEC Alpha 25 18 42 21\ +SunOS-5.4 sun4m 37 128 52 73\ +DEC Alpha 39 55 46 112\ +DEC Alpha 53 50 56 62\ +DEC Alpha 53 66 59 93\ +DEC Alpha 59 68 115 134\ +DEC Alpha 14 27 22 159\ +DEC Alpha 40 42 46 205\ +Sun Ultra1 14 27 20 73\ +Unixware/i686 21 22 \ +DEC Alpha 43 142 45 197\ +SunOS-5.4 sun4m 54 65 85 102\ +SunOS-5.4 sun4m 75 31 110 102\ +IBM Power2 13 16 18 43\ +HP-UX 9000/819 13 41 15 109\ +HP-UX 9000/755 25 29 29 220\ +HP-UX 9000/735 29 39 31 204\ +HP-UX 9000/735 29 42 34 205\ +HP-UX 9000/735 29 32 30 164\ +Linux i586 36 163 47 222\ +Linux i686 6 22 7 107\ +Linux i586 13 178 20 273\ +Linux i586 13 182 21 232\ +Linux i586 16 218 22 266\ +Linux i586 66 240 83 347\ +Sun SC1000 107 135 104 362\ +SunOS-5.4 sun4d 137 245 164 486\ +SunOS-5.4 sun4d 224 113 245 134\ +FreeBSD/i586 28 67 34 158\ +IRIX5.3 IP22 40 47 38 104\ +IBM PowerPC 16 87 26 144\ +FreeBSD/i586 30 54 36 137\ +FreeBSD/i586 24 54 28 137\ +IRIX64 IP21 84 104 87 101\ +dgux mc88110 89 119 122 263\ +HP-UX 9000/897 20 39 23 111\ +HP-UX 9000/735 27 37 30 222\ +FreeBSD/i586 29 41 35 123\ +FreeBSD/i586 29 -13 36 78\ +IRIX IP22 38 50 42 74\ +IRIX64-601 IP26 72 92 74 93\ +IRIX64 IP19 59 68 79 91\ +IRIX64 IP25 55 77 59 85\ +IRIX64 IP19 63 80 69 93\ +IRIX IP19 141 150 96 115\ +HP-UX 9000/770 21 24 21 218\ +IRIX5.3 IP19 150 157 102 167\ +.TE +.KE diff --git a/performance/lmbench3/doc/description.ms b/performance/lmbench3/doc/description.ms new file mode 100644 index 0000000..91d2b23 --- /dev/null +++ b/performance/lmbench3/doc/description.ms @@ -0,0 +1,531 @@ +.\" $X$ xroff -mgs $file +.\" $tty$ groff -mgs $file | colcrt - | more +.\" $lpr$ groff -mgs $file > ${file}.lpr +.\" Define a page top that looks cool +.de PT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. 
nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.de BT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 1994 \\*[author]'\\*(DY'%' +. ps +.. +.\" Configuration +.VARPS +.nr HM 1.0i +.nr FM 1i +.if t .nr PO .75i +.if t .nr LL 7.0i +.if n .nr PO .25i +.if n .nr LL 7.5i +.nr PS 11 +.nr VS \n(PS+2 +.ds title Portable Tools for Performance Analysis +.ds author Larry McVoy +.TL +lmbench: +.sp .5 +\*[title] +.br +\s8Revision $Revision: 1.4 $ of $Date: 94/11/23 18:02:12-08:00 $\s0 +.AU +\*[author] +.AI +.ps -2 +lm@xxxxxxx\** +(415) 390-1804 +.ps +2 +.AB +A description of a set benchmarks for measuring system performance. +The benchmarks include latency measurements of basic system operations +such as memory, processes, networking, and disks, and bandwidth measurements +of memory, disks, and networking. +The benchmarks have been run under a wide variety of Unix systems. +The benchmarks are freely distributed under +the GNU General Public License, with the additional restriction +that results may be reported only if the benchmarks are unmodified. +.AE +.sp 2 +.if t .2C +.FS +This work was mostly done while the author was an employee of Sun Microsystems +Computer Corporation. +.FE +.NH 1 +Introduction +.LP +The purpose of this project is to provide the computer community with tools +for performance analysis of basic operations of their computer systems. +The tools are designed +to be both portable and comparable over a wide set of Unix systems.\** +.FS +The tools have been run on +AIX, +BSDI, +HP-UX, +IRIX, +Linux, +NetBSD, +OSF/1, +Solaris, +and +SunOS by the author. 
+.FE +The interfaces that the tools use have been carefully chosen to be as portable +and standard as possible. It is an explicit intent of the benchmark to measure +standard interfaces. Users of this benchmark may not report results from +modified versions of the benchmarks.\** +.FS +For example, the context switch benchmark may not use a \f(CWyield()\fP +primitive instead of pipes; the networking benchmarks must use the socket +interfaces, not TLI or some other interface. +.FE +.PP +The purpose of +this document is to describe each of the benchmarks. +.PP +The benchmarks are loosely divided into latency, bandwidth, and ``other'' +categories. +.NH 1 +Latency measurements +.LP +The latency measurements included in this suite are process creation times +(including address space extension via mmap()), +basic operating system entry cost, context switching, inter process +communication, file system latency, +disk latency (you must be the super user to get +disk latency results), and memory latency. +.PP +Process benchmarks are used to measure the basic process primitives, +such as creating a new process, running a different program, and context +switching. Process creation benchmarks are of particular interest +to distributed systems since many remote operations include the creation +of a remote process to shepherd the remote operation to completion. +Context switching is important for the same reasons. +.PP +Inter process communication latency is important because many operations +are control messages that tell another process (frequently on another +system) to do something. The latency of telling the remote process to +do something is pure overhead and is frequently in the critical path +of important functions, such as distributed databases.\** +.FS +The performance of the TCP latency benchmark has proven to be a good +estimate of the performance of the Oracle database lock manager. 
+.FE +.PP +The inter process communication latency benchmarks are roughly the same +idea: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds it takes +to do one round trip. If you are interested in a one way timing, then +about half the round trip is right (however, the CPU cycles tend to be +somewhat asymmetric for a one trip). +.NH 2 +Process forks/exits +.LP +Create a child process which does nothing but +terminate. Results are reported in creations per second. +The benchmark is measuring how fast the OS can create a new address +space and process context. +The child process is spawned via the \f(CBfork\fP() interface, +not the \f(CBvfork\fP() interface. +.NH 2 +Simple process creates I +.LP +Create a child process which then runs a new program that does nothing +but print ``hello world'' and exit. The difference between this +benchmark and the previous is the running of a new program. The +time difference between this and the previous benchmark is the cost +of starting a new (simple) program. That cost is especially noticeable +on (some) systems that have shared libraries. Shared libraries can +introduce a substantial (10s of milliseconds) start up cost. This +benchmark is intended to quantify the time/space tradeoff of shared +libraries. +.NH 2 +Simple process creates II +.LP +Create a child process which runs the same new program except that the +program is started by the system shell. This is a clone of the C +library \f(CBsystem\fP() interface. The intent is to educate users +about the cost of this interface. I have long felt that using the +Bourne shell, especially a dynamically linked Bourne shell, to start up +processes is over kill; perhaps these numbers will convince others of the +same thing. A better choice would be Plan 9's \f(CBrc\fP shell (which +is, by the way, free software). 
+.NH 2 +Memory mapping +.LP +Memory mapping is the process of making a file part of a process' address +space, allowing direct access to the file's pages. It is an alternative +to the traditional read and write interfaces. Memory mapping is extensively +used for linking in shared libraries at run time. This benchmark measures +the speed at which mappings can be created as well as removed. Results +are reported in mappings per second, and the results can be graphed as the +test is run over a series of different sizes. +.NH 2 +Context switches +.LP +Measures process context switch time.\** A context switch is defined as +the time it takes to save the state of one process and restore the state +of another process. +Typical context switch benchmarks measure just the minimal context switch +time, i.e., the time to switch between two processes that are doing nothing +but context switching. That approach is misleading because systems may +have multiple active processes and the processes typically have more state +(hot cache lines) than just the code required to force another context +switch. This benchmark takes that into consideration and varies both +the number and the size of the processes. +.FS +A previous version of this benchmark included several system calls +in addition to the context switch, resulting in grossly over inflated +context switch times. +.FE +.PP +The benchmark is a ring of two to twenty processes that are connected +with Unix pipes. A token is passed from process to process, forcing +context switches. The benchmark measures the time it takes to pass +the token two thousand times from process to process. Each hand off +of the token has two costs: (a) the context switch, and (b) the cost +of passing the token. In order to get just the context switching time, +the benchmark first measures the cost of passing the token through a +ring of pipes in a single process. 
This time is defined as the cost +of passing the token and is not included in the reported context switch +time. +.PP +When the processes are larger than the default baseline of ``zero'' +(where zero means just big enough to do the benchmark), the cost +of the context switch includes the cost of restoring user level +state (cache lines). This is accomplished by having the process +allocate an array of data and sum it as a series of integers +after receiving the token but before passing the token to the +next process. Note that the overhead mentioned above includes +the cost of accessing the data but because it is measured in +just one address space, the cost is typically the cost with hot +caches. So the context switch time does not include anything +other than the context switch provided that all the processes +fit in the cache. If there are cache misses (as is common), the +cost of the context switch includes the cost of those cache misses. +.PP +Results for an HP system running at 100 mhz are shown below. +This is a particularly nice system for this benchmark because the +results are quite close to what is expected from a machine with a +256KB cache. As the size and number of processes are both increased, +processes start falling out of the cache, resulting in higher context +switch times. +.LP +.so ctx.pic +.NH 2 +Null system calls +.LP +Measures the cost of entering and exiting (without pausing) the +operating system. This is accomplished by repeatedly writing one byte +to \f(CB/dev/null\fP, a pseudo device driver that does nothing but +discard the data. Results are reported as system calls per second. +.PP +It is important to note that the system call chosen actually does the +work on all systems, to the best of my knowledge. There are some +systems that optimized trivial system calls, such as \f(CBgetpid\fP(), +to return the answer without a true entry into the OS proper. Writing +to \f(CB/dev/null\fP has not been optimized. 
+.NH 2 +Pipe latency +.LP +This benchmark measures the OS; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through a pair of +Unix pipes. +.NH 2 +TCP/IP latency +.LP +This benchmark measures the OS +networking code and the driver code; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through an AF_INET +socket. Note that both remote and local results may be reported. +.NH 2 +UDP/IP latency +.LP +This benchmark measures the OS +networking code and the driver code; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through an AF_INET socket. +Note that both remote +and local results may be reported. +.LP +It is interesting to note that the TCP performance is sometimes +greater than the UDP performance. +This is contrary to expectations since +the TCP protocol is a reliable, connection oriented protocol, and as such +is expected to carry more overhead. +Why this is so is an exercise left to the +reader. +.NH 2 +RPC latency (TCP and UDP) +.LP +Actually two latency benchmarks: Sun RPC over TCP/IP and over UDP/IP. +This benchmark consists of the user level RPC code layered over the TCP +or UDP sockets. The benchmark measures the round trip time of a small +message being passed back and forth between two processes. Note that +both remote and local results may be reported. +.LP +Using the TCP or the UDP benchmarks as a baseline, it +is possible to see how much the RPC code is costing. +.NH 2 +TCP/IP connect latency +.LP +This benchmark measures the time it takes to get a TCP/IP socket and +connect it to a remote server. 
+.NH 2 +File system latency +.LP +A benchmark that measures how fast the file system can do basic, common +operations, such as creates and deletes of small files. +.NH 2 +Page fault latency +.LP +A benchmark that measures how fast the file system can pagefault in a +page that is not in memory. +.NH 2 +Disk latency +.LP +A benchmark that is designed to measure the overhead of a disk +operation. Results are reported as operations per second. +.PP +The benchmark is designed with SCSI disks in mind. It actually simulates +a large number of disks in the following way. The benchmark reads 512 byte +chunks sequentially from the raw disk device (raw disks are unbuffered +and are not read ahead by Unix). The benchmark ``knows'' that most +disks have read ahead buffers that read ahead the next 32-128 kilobytes. +Furthermore, the benchmark ``knows'' that the disks rotate and read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true - a processor could be fast enough to make the +requests faster than the rotating disk. If we take 3MB/sec to be disk +speed, a fair speed, and divide that by 512, that is 6144 IOs/second, or +163 microseconds per IO. I don't know of any processor/OS/io controller +combinations that can do an +IO in 163 microseconds. +.FE +So the benchmark is basically reading small chunks of data from the +disks track buffer. Another way to look at this is that the benchmark +is doing memory to memory transfers across a SCSI channel. +.PP +No matter how you look at it, the resulting number represents a +\fBlower\fP bound on the overhead of a disk I/O. In point of fact, +the real numbers will be higher on SCSI systems. Most SCSI controllers +will not disconnect if the request can be satisfied immediately; that is +the case here. In practice, the overhead numbers will be higher because +the processor will send the request, disconnect, get interrupted, +reconnect, and transfer. 
+.PP +It is possible to generate loads of upwards of 500 IOPs on a single +SCSI disk using this technique. It is useful to do that to figure out +how many drives could be supported on a system before there are no +more processor cycles to handle the load. Using this trick, you +do not have to hook up 30 drives, you simulate them. +.NH 2 +Memory read latency +.LP +This is perhaps the most interesting benchmark in the suite. The +entire memory hierarchy is measured, including onboard cache latency +and size, external cache latency and size, main memory latency, and TLB +miss latency. +.PP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. Then the list is walked like so +.DS +.ft CB +mov r0,(r0) # C code: p = *p; +.DE +The time to do about fifty thousand loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction (not counting stalls) in one +processor cycle. In other words, if the processor cache load time +is 60 nanoseconds on a 20 nanosecond processor, the load latency reported +would be 40 nanoseconds, the missing 20 nanoseconds is for the load instruction +itself. Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (I don't think any processors can do that). +.PP +Note that this benchmark has been validated by logic analyzer measurements +on an SGI indy. The +clever reader might realize that the last few nanoseconds of inaccuracy could be +rounded off by realizing that the latency is always going to be a multiple +of the processor clock rate. 
+.PP +The raw data is a series of data sets. Each data set is a stride size, +with array size varied from about one kilobyte up to eight megabytes. +When these data sets are all plotted together (using a log base 2 scale +for the size variable), the data will be seen to contain a series of +horizontal plateaus. The first is the onboard data cache latency (if there +is an onboard cache). The point where the lines start to go up marks the +size of the cache. The second is the external cache, the third is the +main memory, and the last is main memory plus TLB miss cost. In addition +to this information, the cache line size can be derived by noticing which +strides are faster than main memory times. The first stride that is +main memory speed is likely to be the cache line size. The reason is +that the strides that are faster than memory indicate that the benchmark is +getting more than one hit per cache line. Note that prefetching may confuse +you. +.PP +The graph below shows a particularly nicely made machine, a DEC alpha. +This machine is nice because (a) it shows the latencies and sizes of +the on chip level 1 and motherboard level 2 caches, and (b) because it +has the best all around numbers, especially considering it can support a +4MB level 2 cache. Nice work, DEC. +.so mem.pic +.NH 1 +Bandwidth measurements +.LP +One of my former managers\** once noted that ``Unix is Swahili for bcopy().'' +I believe that he was indicating his belief that the operating system spent +most of its time moving data from one place to another, via various means. +I tend to agree and have measured the various ways that data can be moved. +The ways that are measured are: through pipes, TCP sockets, library bcopy() +and hand unrolled bcopy(), the read() interface, through the mmap() interface, +and direct memory read and write (no copying). +.FS +Ken Okin +.FE +.NH 2 +Pipe bandwidth +.LP +Bandwidth measurement between two local processes communicating through +a Unix pipe. 
Results are in megabytes per second. +.NH 2 +TCP/IP socket bandwidth +.LP +Bandwidth measurement using TCP/IP sockets. Results are reported in megabytes +per second. +Results are reported for local, ethernet, FDDI, and ATM, where possible. +Results range from 1-10+ megabytes per second. Any system delivering +more than 10 MB/second over TCP is doing very well by 1994 standards. +.PP +Note that for local measurements, the system is actually moving +twice as much data, since the data is being moved to/from the same host. +.PP +Local bandwidths are (sometimes) useful for determining the overhead of the +protocol stack (as well as other OS tasks, such as context switching). +Note, however, that some implementations (such as Solaris 2.x) have +``fast pathed'' loopback IP which skews the results. The fast path +uses a larger MTU and does not do checksums. +.PP +The sockets are configured to use the largest receive/send buffers that the OS +will allow. This is done to allow maximum bandwidth. Sun's 4.x TCP/IP +subsystem (and probably BSD's as well) default to 4KB send/receive buffers, +which is too small. (It would be better if the OS noted that this was a +high volume / high bandwidth connection and automatically grew the buffers. +Hint, hint.) +.NH 2 +bcopy bandwidths +.LP +A simple benchmark that measures how fast data can be copied. A hand +unrolled version and the C library version are tested. Results are +reported in megabytes per second. Note that a typical system is actually +moving about three times as much memory as the reported result. A copy +is actually a read, a write which causes a cache line read, and a write +back. +.NH 2 +Read bandwidth +.LP +Most VM system cache file pages for reuse. This benchmark measures the +speed at which those pages can be reused. It is important to notice +that this is not a disk read measurement, it is a memory read measurement. +Results are reported in megabytes per second. 
+.NH 2 +Mmap read bandwidth +.LP +The same measurement as the previous benchmark except that it maps the +file, avoiding the copy from kernel to user buffer. +Results are reported in megabytes per second. +.NH 2 +Memory read bandwidth +.LP +A large array is repeatedly read sequentially. +Results reported in megabytes per second. +.NH 2 +Memory write bandwidth +.LP +A large array is repeatedly written sequentially. +Results reported in megabytes per second. +.NH 1 +Other measurements +.LP +.NH 2 +Processor cycle time +mhz +.LP +Calculates the megahertz and clock speed of the processor. This is the +standard loop in which a series of interlocked operations are timed, +and then the megahertz is derived from the timing. The operations +are purposefully interlocked to overcome any superscalarness of the +system under test. +.PP +There are actually three versions of mhz, a generic one that works on +most systems, and two specific versions for SuperSPARC and rs6000 +systems. +.PP +It turns out that the +SuperSPARC processor has two ALU's that are run at twice the clock rate, +allowing two interlocked operations to complete in one processor clock.\** +.FS +Credit and thanks to John Mashey of SGI/MIPS fame, who kindly took the +time to figure out why the benchmark wasn't working on SuperSPARC +systems. He explained the SuperSPARC pipeline and the solution to the +problem. +.FE +Fortunately, the ALU's are asymmetric and can not do two shifts in +one processor clock. Shifts are used on SuperSPARC systems. +.PP +IBM rs6000 systems have a C compiler that does not honor the +``register'' directive in unoptimized code. The IBM loop looks +like it is doing half as many instructions as the others. This +is on purpose, each add on the IBM is actually two instructions +(I think it is a load/add/store or something like that). +.NH 1 +Acknowledgments +.LP +I would like to acknowledge Sun Microsystems for supporting the development +of this project. 
In particular, my personal thanks to Paul Borrill, +Director of the Architecture and Performance group, for conceiving and +supporting the development of these benchmarks. +.PP +My thanks to John Mashey and Neal Nuckolls of Silicon Graphics for reviews, +comments, and explanations of the more obscure problems. +.PP +My thanks to Satya Nishtala of Sun Microsystems for (a) listening to me +complain about memory latencies over and over, (b) doing something about +it in future SPARC systems, and (c) reviewing the memory latency results +and explained IBM's sub blocking scheme (I still don't really understand +it but he does. Ask him). +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks will be posted to the Usenet comp.benchmarks group. In addition, +mail sent to \f(CBarchives@xxxxxxxxxxxxxxxxxxx\fP with a request for +\f(CBlmbench.shar\fP +sources will get the latest and greatest. diff --git a/performance/lmbench3/doc/graph.1 b/performance/lmbench3/doc/graph.1 new file mode 100644 index 0000000..64a5cb3 --- /dev/null +++ b/performance/lmbench3/doc/graph.1 @@ -0,0 +1,143 @@ +.\" $Id: graph.1 1.2 94/12/27 17:50:18-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH GRAPH 1 +.SH NAME +graph \- compile graphs into pic input +.SH SYNOPSIS +.B graph +[ options ] +[ +.I filename +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B graph +is a perl script which +takes sets of X Y data and generates a (human readable) pic program +that will produce the graphed data. The output is designed such that +you can save it in a file and tweak it to make it fit your document. +Try one and look at the output. The output is actually commented. +.LP +The graph is autosized and auto ticked. 
+.LP +The input data format is similar to +that of xgraph(1), i.e., +.DS +1 1 +2 2 +3 3 +"sloped across + +1 4 +2 4 +3 4 +"straight across +.DE +.SH "CONTROL OPTIONS" +.LP +You may set the graph title, the X title, and the Y title with the +following control sequences in the data stream: +.DS +%T Graph title in +4 point font +%X X axis title and/or units in +2 point font +%Y Y axis title and/or units in +2 point font +%fakemax-X <value> force graph to be that big +%fakemax-Y <value> force graph to be that big +%fakemin-X <value> force graph to be that small +%fakemin-Y <value> force graph to be that small +.DE +.SH OPTIONS +.IP -rev 12 +reverse X/Y data sense (and titles). Note this is done after processing +any fudging of the input data stream(s) (see -xk, -yk, -logx, etc below). +.IP -below +put data set titles below the graph rather than to the right. +.IP -close +no extra space around the data's endpoints. +.IP -qline +connect the quartile center points. +.IP -grid +dotted line grid marks. +.IP -nobox +no box around whole graph. +.IP -big +make the graph take the whole page. +.IP -medium +make the graph take about 1/2 the page. +.IP -small +make the graph be small. +.IP -grapheach +draw each data set in its own graph. +.IP -nolabels +no X/Y/Title labels. +.IP -nodatal +no data set labels. +.IP -nomarks +do not mark each data point with distinct markers (endpoints are still +marked). +.IP -k +print values larger than 1000 as value/1000. +.IP -xk +multiply X input by 1024 (blech). +.IP -yk +multiply Y input by 1024 (blech). +.IP -xm +multiply X input by 1024*1024 (blech). +.IP -ym +multiply Y input by 1024*1024 (blech). +.IP -logx +convert X input into log base 2 of X input. +.IP -logy +convert Y input into log base 2 of Y input. +.SH EXAMPLE +Workstation price performance from a Digital ad. 
Process with +.DS +.ps -2 +graph -rev workstations | groff -TX75 + +"%T Workstation Price / Performance, 6/93 +"%X SPECINT 92 Performance +"%Y Price in $1000's +35 5 +65 10 +78 15 +110 70 +"Dec AXP line + +25 4 +25 8 +38 16 +48 21 +52 23 +64 27 +"Sun SPARC line +.DE +.ps +.SH "QUARTILE FORMAT" +Data points are \f(CBx y1 y2 y3 y4 y5\fP. You get a two lines from the +first two y values, a mark at the third, and another line from the last two. +.SH "SEE ALSO" +.BR gtroff (1), +.BR gpic (1), +.BR perl (1). +.SH BUGS +This should probably be called pic_graph or something like that. +.LP +This isn't done as much as I would like. +It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE things. I use it to manually generate +a pic file and then include that. diff --git a/performance/lmbench3/doc/lat_allmem.tbl b/performance/lmbench3/doc/lat_allmem.tbl new file mode 100644 index 0000000..8594cb9 --- /dev/null +++ b/performance/lmbench3/doc/lat_allmem.tbl @@ -0,0 +1,62 @@ +.KS +.TS +expand doublebox; +l c c c +l c c c +l r r r. 
+ Level 1 Level 2 Main +System cache cache memory += +Linux i586 8 103 151\ +DEC Alpha 12 67 291\ +Linux i586 8 107 150\ +DEC Alpha 10 56 321\ +Unixware/i686 14 34 196\ +DEC Alpha 9 51 288\ +DEC Alpha 7 47 458\ +DEC Alpha 12 57 468\ +SunOS-5.4 sun4m 13 -- 180\ +SunOS-5.4 sun4m 20 -- 291\ +SunOS-5.4 sun4m 16 115 816\ +Sun Ultra1 6 42 270\ +SunOS-5.4 sun4d 16 116 995\ +IBM Power2 -- 13 141\ +IBM PowerPC 6 164 394\ +DEC Alpha 10 53 477\ +FreeBSD/i586 10 115 179\ +FreeBSD/i586 7 111 181\ +DEC Alpha 13 104 957\ +FreeBSD/i586 10 118 180\ +FreeBSD/i586 10 101 180\ +HP-UX 9000/735 -- 10 347\ +Sun SC1000 20 140 1236\ +HP-UX 9000/770 -- 9 376\ +SunOS-5.4 sun4d 24 173 1246\ +Linux i686 12 90 194\ +Linux i586 10 190 320\ +Linux i586 10 148 320\ +Linux i586 10 198 321\ +Linux i586 10 222 321\ +Linux i486 12 234 336\ +Linux alpha 3 83 354\ +Linux alpha 3 43 361\ +DEC Alpha 3 42 396\ +HP-UX 9000/735 -- 10 348\ +IRIX5.3 IP22 10 76 1018\ +IRIX64 IP25 8 58 1134\ +HP-UX 9000/735 -- 10 347\ +HP-UX 9000/897 -- 11 424\ +HP-UX 9000/819 -- 10 430\ +IRIX64 IP21 11 100 709\ +IRIX64 IP19 10 75 1150\ +IRIX IP19 8 64 1189\ +IRIX5.3 IP19 10 75 1149\ +IRIX64 IP19 10 70 1152\ +IRIX IP22 8 64 1170\ +FreeBSD/i586 10 106 181\ +HP-UX 9000/735 -- 10 348\ +HP-UX 9000/755 -- 10 393\ +dgux mc88110 22 319 753\ +IRIX64-601 IP26 13 120 1244\ +.TE +.KE diff --git a/performance/lmbench3/doc/lat_allproc.tbl b/performance/lmbench3/doc/lat_allproc.tbl new file mode 100644 index 0000000..d1aee27 --- /dev/null +++ b/performance/lmbench3/doc/lat_allproc.tbl @@ -0,0 +1,60 @@ +.KS +.TS +expand doublebox; +l|c|c|c +l|r|r|r. 
+ fork \fBfork, exec\fP fork, exec +System & exit \fB& exit\fP sh -c & exit += +DEC Alpha 4.6 13\ 42\ +DEC Alpha 3.3 11\ 44\ +Linux alpha 0.7 3\ 12\ +Linux alpha 1.0 2\ 16\ +DEC Alpha 2.0 6\ 43\ +DEC Alpha 4.8 16\ 64\ +Linux i686 0.5 5\ 17\ +DEC Alpha 3.1 10\ 281\ +Linux i586 0.9 5\ 16\ +DEC Alpha 5.3 14\ 27\ +DEC Alpha 5.1 15\ 89\ +Sun Ultra1 3.7 20\ 10\ +SunOS-5.4 sun4m 8.0 46\ 237\ +SunOS-5.4 sun4m 18.0 83\ 37\ +SunOS-5.4 sun4m 10.7 57\ 87\ +Linux i486 3.3 10\ 112\ +Linux i586 1.6 12\ 44\ +SunOS-5.4 sun4d 13.7 75\ 113\ +IBM Power2 1.2 8\ 16\ +IBM PowerPC 2.9 8\ 50\ +SunOS-5.4 sun4d 20.8 93\ 136\ +HP-UX 9000/735 1.3 3\ 17\ +IRIX5.3 IP19 4.3 8\ 20\ +IRIX5.3 IP22 3.1 8\ 19\ +IRIX64-601 IP26 4.6 24\ 39\ +IRIX IP22 3.0 8\ 22\ +Linux i586 2.4 9\ 26\ +Linux i586 1.8 15\ 30\ +Linux i586 1.9 15\ 30\ +Linux i586 3.1 24\ 73\ +DEC Alpha 13.4 33\ 39\ +Sun SC1000 14.0 69\ 175\ +FreeBSD/i586 2.9 14\ 22\ +FreeBSD/i586 2.7 13\ 21\ +IRIX64 IP21 4.2 14\ 30\ +HP-UX 9000/770 3.1 9\ 18\ +FreeBSD/i586 2.8 13\ 22\ +HP-UX 9000/735 3.5 10\ 20\ +HP-UX 9000/735 3.5 10\ 19\ +IRIX64 IP19 4.5 19\ 37\ +HP-UX 9000/819 4.2 67\ 118\ +HP-UX 9000/755 3.6 10\ 18\ +HP-UX 9000/897 6.7 15\ 37\ +IRIX IP19 6.2 19\ 46\ +HP-UX 9000/735 3.5 10\ 20\ +FreeBSD/i586 2.7 12\ 20\ +FreeBSD/i586 3.0 14\ 23\ +IRIX64 IP25 3.3 12\ 24\ +IRIX64 IP19 4.0 14\ 24\ +dgux mc88110 8.8 13\ 67\ +.TE +.KE diff --git a/performance/lmbench3/doc/lat_connect.8 b/performance/lmbench3/doc/lat_connect.8 new file mode 100644 index 0000000..11a7912 --- /dev/null +++ b/performance/lmbench3/doc/lat_connect.8 @@ -0,0 +1,47 @@ +.\" $Id: lat_connect.8 1.2 00/10/16 17:13:41+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_CONNECT 8 "$Date: 00/10/16 17:13:41+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_connect \- measure interprocess connection latency via TCP/IP +.SH SYNOPSIS +.B lat_connect +.I -s +.sp .5 +.B lat_connect +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B 
lat_connect +.I "-S hostname" +.SH DESCRIPTION +.B lat_connect +is a client/server program that measures interprocess +connection latencies. The benchmark times the creation and connection of +an AF_INET (aka TCP/IP) socket to a remote server. Care is taken that the +connection time does not include any other overhead, such as the +\fIgethostbyname()\fP or remote port lookups since these add more overhead +than the connection establishment itself. +.LP +.B lat_connect +has three forms of usage: as a server (-s), as a client (lat_connect localhost), +and as a shutdown (lat_connect -S localhost). +.SH OUTPUT +The reported time is in microseconds per connection. +Output format is like so +.sp +.ft CB +TCP/IP connection cost to localhost: 1006 microseconds +.ft +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_connect.tbl b/performance/lmbench3/doc/lat_connect.tbl new file mode 100644 index 0000000..5e6b4c2 --- /dev/null +++ b/performance/lmbench3/doc/lat_connect.tbl @@ -0,0 +1,44 @@ +.KS +.TS +center expand doublebox; +l r. 
+DEC Alpha 976 +Linux i586 606 +IRIX IP22 470 +SunOS-5.4 sun4d 852 +SunOS-5.4 sun4d 3123 +Sun SC1000 4594 +IRIX64-601 IP26 316 +Linux i586 1155 +IRIX5.3 IP22 349 +IRIX64 IP21 667 +IBM Power2 339 +dgux mc88110 4635 +DEC Alpha 4700 +HP-UX 9000/770 319 +HP-UX 9000/755 384 +HP-UX 9000/735 389 +IRIX64 IP25 716 +IRIX64 IP19 763 +IRIX5.3 IP19 694 +Linux i686 746 +Linux i586 775 +Linux i586 779 +Linux i586 835 +Linux i586 1348 +Linux i486 1439 +DEC Alpha 3047 +FreeBSD/i586 454 +HP-UX 9000/897 765 +FreeBSD/i586 465 +FreeBSD/i586 454 +FreeBSD/i586 397 +IRIX IP19 697 +HP-UX 9000/735 388 +IRIX64 IP19 805 +HP-UX 9000/735 459 +HP-UX 9000/819 585 +HP-UX 9000/735 740 +FreeBSD/i586 481 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_ctx.8 b/performance/lmbench3/doc/lat_ctx.8 new file mode 100644 index 0000000..f5a1c2b --- /dev/null +++ b/performance/lmbench3/doc/lat_ctx.8 @@ -0,0 +1,95 @@ +.\" $Id: lat_ctx.8 1.2 00/10/16 17:13:42+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_CTX 8 "$Date: 00/10/16 17:13:42+02:00 $" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_ctx \- context switching benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "-s <size_in_kbytes>" +] +.I "#procs" +[ +.I "#procs ..." +] +.SH DESCRIPTION +.B lat_ctx +measures context switching time for any reasonable +number of processes of any reasonable size. +The processes are connected in a ring of Unix pipes. Each process +reads a token from its pipe, possibly does some work, and then writes +the token to the next process. +.LP +Processes may vary in number. Smaller numbers of processes result in +faster context switches. More than 20 processes is not supported. +.LP +Processes may vary in size. A size of zero is the baseline process that +does nothing except pass the token on to the next process. A process size +of greater than zero means that the process does some work before passing +on the token. 
The work is simulated as the summing up of an array of the +specified size. The summing is an unrolled loop of about a 2.7 thousand +instructions. +.LP +The effect is that both the data and the instruction cache +get polluted by some amount before the token is passed on. The data +cache gets polluted by approximately the process ``size''. The instruction +cache gets polluted by a constant amount, approximately 2.7 +thousand instructions. +.LP +The pollution of the caches results in larger context switching times for +the larger processes. This may be confusing because the benchmark takes +pains to measure only the context switch time, not including the overhead +of doing the work. The subtle point is that the overhead is measured using +hot caches. As the number and size of the processes increases, the caches +are more and more polluted until the set of processes do not fit. The +context switch times go up because a context switch is defined as the switch +time +plus the time it takes to restore all of the process state, including +cache state. This means that the switch includes the time for the cache +misses on larger processes. +.SH OUTPUT +Output format is intended as input to \fBxgraph\fP or some similar program. +The format is multi line, the first line is a title that specifies the +size and non-context switching overhead of the test. Each subsequent +line is a pair of numbers that indicates the number of processes and +the cost of a context switch. The overhead and the context switch times are +in micro second units. The numbers below are for a SPARCstation 2. +.sp +.ft CB +.nf +"size=0 ovr=179 +2 71 +4 104 +8 134 +16 333 +20 438 +.br +.fi +.ft +.SH BUGS +The numbers produced by this benchmark are somewhat inaccurate; they vary +by about 10 to 15% from run to run. A series of runs may be done and the +lowest numbers reported. The lower the number the more accurate the results. 
+.LP +The reasons for the inaccuracies are possibly interaction between the +VM system and the processor caches. It is possible that sometimes the +benchmark processes are laid out in memory such that there are fewer +TLB/cache conflicts than other times. This is pure speculation on our part. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_disk.tbl b/performance/lmbench3/doc/lat_disk.tbl new file mode 100644 index 0000000..a39a6aa --- /dev/null +++ b/performance/lmbench3/doc/lat_disk.tbl @@ -0,0 +1,23 @@ +.KS +.TS +center expand doublebox; +l r. +SunOS-5.4 sun4m 2876 +Sun SC1000 1466 +DEC Alpha 1436 +DEC Alpha 1995 +IRIX IP22 984 +Sun Ultra1 2242 +HP-UX 9000/770 732 +IRIX IP19 920 +IRIX5.3 IP22 1265 +IRIX5.3 IP19 991 +DEC Alpha 2057 +DEC Alpha 3729 +FreeBSD/i586 297 +FreeBSD/i586 306 +FreeBSD/i586 2314 +FreeBSD/i586 2284 +FreeBSD/i586 310 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_fcntl.8 b/performance/lmbench3/doc/lat_fcntl.8 new file mode 100644 index 0000000..cf3c93e --- /dev/null +++ b/performance/lmbench3/doc/lat_fcntl.8 @@ -0,0 +1,32 @@ +.\" $Id$ +.TH LAT_FCNTL 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_fcntl \- fcntl file locking benchmark +.SH SYNOPSIS +.B lat_fcntl +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_fcntl +is a client/server program that measures file locking latencies. The +benchmark alternately locks and unlocks files so that only one of the +client or server is running at a time, similar to ``hot potato'' +message passing benchmarks. +No other work is done in the processes. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. 
+.SH "SEE ALSO" +lmbench(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_fifo.8 b/performance/lmbench3/doc/lat_fifo.8 new file mode 100644 index 0000000..65e5a08 --- /dev/null +++ b/performance/lmbench3/doc/lat_fifo.8 @@ -0,0 +1,32 @@ +.\" $Id$ +.TH LAT_FIFO 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_fifo \- FIFO benchmark +.SH SYNOPSIS +.B lat_fifo +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_fifo +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +The message is passed back and forth using FIFOs. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_fs.8 b/performance/lmbench3/doc/lat_fs.8 new file mode 100644 index 0000000..51afc83 --- /dev/null +++ b/performance/lmbench3/doc/lat_fs.8 @@ -0,0 +1,37 @@ +.\" $Id: lat_fs.8 1.4 94/11/25 16:33:19-08:00 lm@xxxxxxxxxxxxxxx $ +.TH LAT_FS 8 "$Date: 94/11/25 16:33:19-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_fs \- measure file system create/delete performance +.SH SYNOPSIS +.B lat_fs +[ +.I dir +] +.SH DESCRIPTION +.B lat_fs +is a program that creates a number of small files in the current working +directory and then removes the files. Both the creation and removal of +the files is timed. 
+.SH OPTIONS +If +.I dir +is specified, +.B lat_fs +will change to that directory first and do the creates and deletes there. +Otherwise the creates and deletes are done in $PWD. +.SH OUTPUT +The results are in terms of creates per second and deletes per second +as a function of file size. The output is in 4 column form and is the +size of the file, the number created, the creations per second, and the +removals per second. Output format looks like: +.sp +.ft CB +.nf +0k 500 1304 2740 +1k 500 904 1663 +4k 500 861 1647 +10k 500 674 1516 +.fi +.ft +.SH "SEE ALSO" +lmbench(8). diff --git a/performance/lmbench3/doc/lat_fs.tbl b/performance/lmbench3/doc/lat_fs.tbl new file mode 100644 index 0000000..b73a9d7 --- /dev/null +++ b/performance/lmbench3/doc/lat_fs.tbl @@ -0,0 +1,56 @@ +.KS +.TS +expand doublebox; +l c c +l r r. +System Create \fBDelete\fP += +Linux i586 1.4 0.1 +IRIX64-601 IP26 0.9 0.1 +Linux i586 1.5 0.1 +Linux i586 1.1 0.1 +Linux i586 1.4 0.1 +Linux i686 1.2 0.1 +SunOS-5.4 sun4d 0.7 0.4 +SunOS-5.4 sun4d 18.2 8.3 +Linux i586 1.4 0.1 +Linux i486 0.8 0.1 +Linux i486 0.8 0.1 +Linux i586 2.7 0.2 +Sun SC1000 3.7 1.3 +Linux alpha 4.3 4.2 +DEC Alpha 25.0 11.4 +DEC Alpha 25.0 11.1 +DEC Alpha 0.8 0.3 +DEC Alpha 1.3 0.5 +DEC Alpha 38.5 12.3 +DEC Alpha 33.3 11.9 +DEC Alpha 23.3 11.5 +IRIX64 IP25 3.5 4.0 +IRIX64 IP19 3.1 5.0 +IRIX IP22 13.3 8.4 +Linux alpha 25.0 11.5 +DEC Alpha 25.6 14.1 +dgux mc88110 2.4 0.5 +HP-UX 9000/735 2.8 3.9 +FreeBSD/i586 20.0 8.3 +FreeBSD/i586 20.4 8.3 +FreeBSD/i586 22.7 8.3 +FreeBSD/i586 22.7 8.3 +FreeBSD/i586 19.6 8.3 +IRIX IP19 12.0 11.8 +IRIX5.3 IP19 11.5 11.2 +IBM Power2 13.3 12.8 +IRIX5.3 IP22 9.4 8.5 +HP-UX 9000/735 28.6 11.5 +IRIX64 IP21 11.9 11.5 +IBM PowerPC 12.7 12.7 +HP-UX 9000/770 20.0 11.1 +HP-UX 9000/735 15.4 11.1 +HP-UX 9000/819 3.7 11.8 +HP-UX 9000/897 58.8 17.2 +HP-UX 9000/755 26.3 11.2 +IRIX64 IP19 12.5 9.8 +HP-UX 9000/735 26.3 12.0 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_http.8 
b/performance/lmbench3/doc/lat_http.8 new file mode 100644 index 0000000..a4bb459 --- /dev/null +++ b/performance/lmbench3/doc/lat_http.8 @@ -0,0 +1,41 @@ +.\" $Id$ +.TH LAT_HTTP 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_http \- measure HTTP transaction latency +.SH SYNOPSIS +.B lat_http +[ +.I "-d" +] +[ +.I "-e" +] +[ +.I "-S" +] +.I serverhost +[ +.I port +] +.SH DESCRIPTION +.B lat_http +is a client/server program that measures simple http transaction +latencies. It has its own HTTP server, and it is meant to simply +measure the minimum overall costs of simple HTTP ``GET'' +transactions. It does not measure the performance of third-party HTTP +servers. +.LP +The client simply makes a series of HTTP GET requests for files. The +files are a fixed set of files included with the benchmark. No +special care was made to ensure that the file sizes match any +predetermined distribution. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_connect(8), lat_tcp(8), lat_sig(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_ipc.tbl b/performance/lmbench3/doc/lat_ipc.tbl new file mode 100644 index 0000000..d8a7069 --- /dev/null +++ b/performance/lmbench3/doc/lat_ipc.tbl @@ -0,0 +1,16 @@ +.KS +.TS +expand doublebox; +l l c c c +l l r r r. 
+System Network \fBTCP bw\fP TCP latency UDP latency += +IRIX IP21 hippi 62 1068 1099 +SunOS-5.5 sun4u@167 100baseT 9.5 280 308 +HP-UX 9000/735 fddi 8.8 425 441 +IRIX IP22 10baseT .9 543 602 +IRIX IP21 10baseT .9 1463 1376 +HP-UX 9000/735 10baseT .9 592 603 +Linux 10baseT .7 2954 1912 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_mem_rd.8 b/performance/lmbench3/doc/lat_mem_rd.8 new file mode 100644 index 0000000..5f8509f --- /dev/null +++ b/performance/lmbench3/doc/lat_mem_rd.8 @@ -0,0 +1,97 @@ +.\" $Id: lat_mem_rd.8 1.4 00/10/16 17:13:43+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_MEM_RD 8 "$Date: 00/10/16 17:13:43+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_mem_rd \- memory read latency benchmark +.SH SYNOPSIS +.B lat_mem_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "size_in_megabytes" +.I "stride" +[ +.I "stride stride..." +] +.SH DESCRIPTION +.B lat_mem_rd +measures memory read latency for varying memory sizes and strides. The +results are reported in nanoseconds per load and have been verified +accurate to within a few nanoseconds on an SGI Indy. +.LP +The +entire memory hierarchy is measured, including onboard cache latency +and size, external cache latency and size, main memory latency, and TLB +miss latency. +.LP +Only data accesses are measured; the instruction cache is not measured. +.LP +The benchmark runs as two nested loops. The outer loop is the stride size. +The inner loop is the array size. For each array size, the benchmark +creates a ring of pointers that point backward one stride. Traversing the +array is done by +.sp +.ft CB + p = (char **)*p; +.ft +.sp +in a for loop (the over head of the for loop is not significant; the loop is +an unrolled loop 100 loads long). +.LP +The size of the array varies from 512 bytes to (typically) eight megabytes. +For the small sizes, the cache will have an effect, and the loads will be +much faster. 
This becomes much more apparent when the data is plotted. +.LP +Since this benchmark uses fixed-stride offsets in the pointer chain, +it may be vulnerable to smart, stride-sensitive cache prefetching +policies. Older machines were typically able to prefetch for +sequential access patterns, and some were able to prefetch for strided +forward access patterns, but only a few could prefetch for backward +strided patterns. These capabilities are becoming more widespread +in newer processors. +.SH OUTPUT +Output format is intended as input to \fBxgraph\fP or some similar program +(we use a perl script that produces pic input). +There is a set of data produced for each stride. The data set title +is the stride size and the data points are the array size in megabytes +(floating point value) and the load latency over all points in that array. +.SH "INTERPRETING THE OUTPUT" +The output is best examined in a graph where you typically get a graph +that has four plateaus. The graph should plotted in log base 2 of the +array size on the X axis and the latency on the Y axis. Each stride +is then plotted as a curve. The plateaus that appear correspond to +the onboard cache (if present), external cache (if present), main +memory latency, and TLB miss latency. +.LP +As a rough guide, you may be able to extract the latencies of the +various parts as follows, but you should really look at the graphs, +since these rules of thumb do not always work (some systems do not +have onboard cache, for example). +.IP "onboard cache" 16 +Try stride of 128 and array size of .00098. +.IP "external cache" +Try stride of 128 and array size of .125. +.IP "main memory" +Try stride of 128 and array size of 8. +.IP "TLB miss" +Try the largest stride and the largest array. +.SH BUGS +This program is dependent on the correct operation of +.BR mhz (8). +If you are getting numbers that seem off, check that +.BR mhz (8) +is giving you a clock rate that you believe. 
+.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), tlb(8), cache(8), line(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_mmap.8 b/performance/lmbench3/doc/lat_mmap.8 new file mode 100644 index 0000000..b4a9f2f --- /dev/null +++ b/performance/lmbench3/doc/lat_mmap.8 @@ -0,0 +1,45 @@ +.\" $Id: lat_mmap.8 1.2 00/10/16 17:13:44+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_MMAP 8 "$Date: 00/10/16 17:13:44+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_mmap \- costs of mmapping and unmapping varying file sizes +.SH SYNOPSIS +.B lat_mmap +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B lat_mmap +times how fast a mapping can be made and unmade. This is useful because it +is a fundamental part of processes that use SunOS style shared libraries +(the libraries are mapped in at process start up time and unmapped at +process exit). +.LP +The benchmark maps in and unmaps the first \fIsize\fP bytes of the file +repeatedly and reports the average time for one mapping/unmapping. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %d\\n", megabytes, usecs\fP, i.e., +.sp +.ft CB +8.00 1200 +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lat_nullsys.tbl b/performance/lmbench3/doc/lat_nullsys.tbl new file mode 100644 index 0000000..1adf112 --- /dev/null +++ b/performance/lmbench3/doc/lat_nullsys.tbl @@ -0,0 +1,58 @@ +.KS +.TS +center expand doublebox; +l r. +SunOS-5.4 sun4m 7 +Sun SC1000 9 +SunOS-5.4 sun4d 12 +SunOS-5.4 sun4m 9 +SunOS-5.4 sun4m 13 +Linux alpha 2 +Linux i586 2 +Linux i586 2 +Unixware/i686 5 +Sun Ultra1 5 +DEC Alpha 9 +Linux i586 3 +Linux i586 3 +Linux alpha 3 +DEC Alpha 11 +DEC Alpha 12 +DEC Alpha 15 +IBM PowerPC 12 +DEC Alpha 17 +FreeBSD/i586 7 +FreeBSD/i586 9 +FreeBSD/i586 10 +DEC Alpha 17 +FreeBSD/i586 7 +SunOS-5.4 sun4d 26 +Linux i686 4 +Linux i586 5 +Linux i586 5 +Linux i486 6 +Linux i486 6 +DEC Alpha 9 +DEC Alpha 13 +HP-UX 9000/735 12 +HP-UX 9000/735 13 +HP-UX 9000/735 14 +IRIX5.3 IP19 20 +HP-UX 9000/755 14 +HP-UX 9000/819 19 +IRIX64 IP25 23 +IRIX IP22 10 +IRIX IP19 16 +IRIX64 IP19 18 +IRIX64 IP19 24 +FreeBSD/i586 9 +HP-UX 9000/770 11 +HP-UX 9000/897 92 +HP-UX 9000/735 12 +dgux mc88110 75 +IBM Power2 16 +IRIX64-601 IP26 20 +IRIX64 IP21 25 +IRIX5.3 IP22 11 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_ops.8 b/performance/lmbench3/doc/lat_ops.8 new file mode 100644 index 0000000..87c6e8e --- /dev/null +++ b/performance/lmbench3/doc/lat_ops.8 @@ -0,0 +1,37 @@ +.\" $Id$ +.TH LAT_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_ops \- basic CPU operation parallelism +.SH SYNOPSIS +.B lat_ops +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_ops +measures the latency of basic CPU operations, such as +integer ADD. +.TP +integer bit, add, mul, div, mod operations +maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations. +.TP +uint64 bit, add, mul, div, mod operations +maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations. +.TP +float add, mul, div operations +maximum parallelism for flot ADD, MUL, DIV operations. 
+.TP +double add, mul, div operations +maximum parallelism for float ADD, MUL, DIV operations. +.SH BUGS +This benchmark is highly experimental and may sometimes (frequently?) +give erroneous results. +.SH "SEE ALSO" +lmbench(8), par_ops(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_pagefault.8 b/performance/lmbench3/doc/lat_pagefault.8 new file mode 100644 index 0000000..e1cd958 --- /dev/null +++ b/performance/lmbench3/doc/lat_pagefault.8 @@ -0,0 +1,46 @@ +.\" $Id: lat_pagefault.8 1.2 00/10/16 17:13:45+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PAGEFAULT 8 "$Date: 00/10/16 17:13:45+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_pagefault \- measure the cost of pagefaulting pages from a file +.SH SYNOPSIS +.B lat_pagefault +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I file +[ +.I file.... +] +.SH DESCRIPTION +.B lat_pagefault +times how fast a page of a file can be faulted in. The file is flushed from +(local) memory by using the \f(CBmsync()\fP interface with the invalidate +flag set. (Note that NFS does not send this over the wire so this makes +for a handy way to measure the cost of going across the wire.) +.LP +The benchmark maps in the entire file and then accesses the pages backwards using +a stride of 256 kilobytes. +.SH OUTPUT +Output format is below; it prints the average cost of page faulting a page. +.sp +.ft CB +Pagefaults on <file>: <d> usecs +.ft +.SH BUGS +Using a stride of 256K may be a bad idea because SCSI controllers +may have caches bigger than that. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lat_pipe.8 b/performance/lmbench3/doc/lat_pipe.8 new file mode 100644 index 0000000..1dff34e --- /dev/null +++ b/performance/lmbench3/doc/lat_pipe.8 @@ -0,0 +1,38 @@ +.\" $Id: lat_pipe.8 1.2 00/10/16 17:13:45+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PIPE 8 "$Date: 00/10/16 17:13:45+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_pipe \- measure interprocess communication latency through pipes +.SH SYNOPSIS +.B lat_pipe +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_pipe +uses two processes communicating through a Unix pipe to measure interprocess +communication latencies. The benchmark passes a token back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is included. +Output format is like so +.sp +.ft CB +Pipe latency: 491 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_pipe.tbl b/performance/lmbench3/doc/lat_pipe.tbl new file mode 100644 index 0000000..5c34872 --- /dev/null +++ b/performance/lmbench3/doc/lat_pipe.tbl @@ -0,0 +1,58 @@ +.KS +.TS +center expand doublebox; +l r. 
+SunOS-5.4 sun4m 194 +SunOS-5.4 sun4m 150 +DEC Alpha 141 +Linux alpha 34 +Linux i486 56 +Linux i486 56 +Unixware/i686 86 +Linux i586 33 +Sun Ultra1 62 +SunOS-5.4 sun4m 372 +Linux alpha 34 +DEC Alpha 162 +DEC Alpha 191 +Linux i586 42 +DEC Alpha 71 +DEC Alpha 179 +Sun SC1000 278 +IBM PowerPC 65 +dgux mc88110 474 +SunOS-5.4 sun4d 519 +FreeBSD/i586 104 +FreeBSD/i586 111 +FreeBSD/i586 115 +SunOS-5.4 sun4d 671 +Linux i586 84 +Linux i686 31 +Linux i586 43 +Linux i586 43 +Linux i586 140 +DEC Alpha 185 +DEC Alpha 198 +DEC Alpha 278 +HP-UX 9000/755 193 +HP-UX 9000/897 118 +IRIX64 IP19 187 +HP-UX 9000/770 148 +HP-UX 9000/819 113 +HP-UX 9000/735 181 +FreeBSD/i586 115 +IRIX IP22 118 +HP-UX 9000/735 178 +HP-UX 9000/735 169 +HP-UX 9000/735 172 +IRIX64 IP21 264 +IRIX5.3 IP19 366 +IBM Power2 91 +IRIX64 IP25 230 +IRIX64-601 IP26 222 +IRIX64 IP19 251 +IRIX IP19 333 +FreeBSD/i586 127 +IRIX5.3 IP22 131 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_proc.8 b/performance/lmbench3/doc/lat_proc.8 new file mode 100644 index 0000000..51c8e69 --- /dev/null +++ b/performance/lmbench3/doc/lat_proc.8 @@ -0,0 +1,58 @@ +.\" $Id: lat_proc.8 1.2 00/10/16 17:13:46+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PROC 8 "$Date: 00/10/16 17:13:46+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_proc \- process creation tests +.SH SYNOPSIS +.B lat_proc +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "procedure|fork|exec|shell" +.SH DESCRIPTION +.B lat_proc +creates processes in three different forms, each more expensive than the last. +The purposes is to measure the time that it takes to create a basic thread +of control. +.LP +The forms are listed and described below: +.TP 20 +Process fork+exit +The time it takes to split a process into two (nearly) identical copies +and have one exit. This is how new processes are created but is not +very useful since both processes are doing the same thing. 
+.TP +Process fork+execve +The time it takes to create a new process and have that new process run a new +program. This is the inner loop of all shells (command interpreters). +.TP +Process fork+/bin/sh -c +The time it takes to create a new process and have that new process run a new +program by asking the system shell to find that program and run it. This is +how the C library interface called \f(CBsystem\fP is implemented. It is the +most general and the most expensive. +.SH OUTPUT +Output is in microseconds per operation like so: +.sp +.ft CB +.nf +Process fork+exit: 6054 microseconds +Process fork+execve: 11212 microseconds +Process fork+/bin/sh -c: 44346 microseconds +.br +.fi +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_rpc.8 b/performance/lmbench3/doc/lat_rpc.8 new file mode 100644 index 0000000..12680da --- /dev/null +++ b/performance/lmbench3/doc/lat_rpc.8 @@ -0,0 +1,68 @@ +.\" $Id: lat_rpc.8 1.2 00/10/16 17:13:47+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_RPC 8 "$Date: 00/10/16 17:13:47+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_rpc \- measure interprocess communication latency via Sun RPC +.SH SYNOPSIS +.B lat_rpc +.I -s +.sp .5 +.B lat_rpc +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "-p tcp|udp" +] +.I hostname +[ +.I "udp|tcp" +] +.sp .5 +.B lat_rpc +.I "-S hostname" +.SH DESCRIPTION +.B lat_rpc +is a client/server program that measures interprocess +communication latencies. The benchmark passes a token back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. 
+.LP +This benchmark may be compared to the TCP and UDP forms of the same benchmark +to accurately see the cost of using RPC versus the cost of using plain +old TCP or UDP sockets. It is worth noting that the RPC form is passing +back and forth a single byte, not some long complicated record. +.LP +.B lat_rpc +has three forms of usage: as a server (-s), as a client (lat_rpc localhost), and +as a shutdown (lat_rpc -S localhost). +.LP +The client form may specify the protocol over which the RPCs are performed. +The default is to measure performance for both +.I udp +and +.IR tcp . +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. +Output format is like so +.sp +.ft CB +RPC/udp latency using localhost: 1344 microseconds +.br +RPC/tcp latency using localhost: 2089 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_select.8 b/performance/lmbench3/doc/lat_select.8 new file mode 100644 index 0000000..03f83bf --- /dev/null +++ b/performance/lmbench3/doc/lat_select.8 @@ -0,0 +1,33 @@ +.\" $Id$ +.TH LAT_SELECT 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_select \- select benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "n" +] +.SH DESCRIPTION +.B lat_select +measures the time to do a select on +.I n +file descriptors. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
+ diff --git a/performance/lmbench3/doc/lat_sig.8 b/performance/lmbench3/doc/lat_sig.8 new file mode 100644 index 0000000..91baf78 --- /dev/null +++ b/performance/lmbench3/doc/lat_sig.8 @@ -0,0 +1,33 @@ +.\" $Id$ +.TH LAT_SIG 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_sig \- select benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "install|catch|prot" +[ +.I "file" +] +.SH DESCRIPTION +.B lat_sig +measures the time to install and catch signals. It can also measure +the time to catch a protection fault. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_signal.tbl b/performance/lmbench3/doc/lat_signal.tbl new file mode 100644 index 0000000..deac19b --- /dev/null +++ b/performance/lmbench3/doc/lat_signal.tbl @@ -0,0 +1,48 @@ +.KS +.TS +expand doublebox; +l c c +l r r. 
+System sigaction \fBsig handler\fP += +DEC Alpha 20 30 +IRIX5.3 IP22 5 9 +IRIX IP22 10 12 +IRIX64-601 IP26 11 10 +Linux i586 11 22 +Linux i586 12 22 +DEC Alpha 5 101 +Linux alpha 13 38 +Linux i486 6 45 +Linux alpha 18 37 +Linux i586 9 25 +Linux i586 8 50 +dgux mc88110 5 16 +FreeBSD/i586 4 16 +FreeBSD/i586 10 34 +Linux i486 7 52 +FreeBSD/i586 9 34 +DEC Alpha 6 138 +IRIX64 IP19 6 9 +IRIX5.3 IP19 4 8 +IRIX64 IP21 5 13 +Linux i686 4 14 +Linux i586 4 23 +Linux i586 6 23 +HP-UX 9000/897 10 38 +IRIX64 IP19 4 35 +HP-UX 9000/770 10 37 +HP-UX 9000/819 11 54 +HP-UX 9000/755 10 52 +HP-UX 9000/735 10 38 +HP-UX 9000/735 6 32 +IRIX IP19 6 79 +HP-UX 9000/735 5 55 +IRIX64 IP25 5 55 +IBM PowerPC 5 19 +FreeBSD/i586 13 56 +IBM Power2 52 355 +HP-UX 9000/735 15 47 +FreeBSD/i586 18 52 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_syscall.8 b/performance/lmbench3/doc/lat_syscall.8 new file mode 100644 index 0000000..61b0ada --- /dev/null +++ b/performance/lmbench3/doc/lat_syscall.8 @@ -0,0 +1,70 @@ +.\" $Id: lat_syscall.8 1.2 00/10/16 17:13:48+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_SYSCALL 8 "$Date: 00/10/16 17:13:48+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_syscall - time simple entry into the operating system +.SH SYNOPSIS +.B lat_syscall +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "null|read|write|stat|fstat|open" +[ +.I file +] +.SH DESCRIPTION +.TP +null +measures how long it takes to do +.IR getppid (). +We chose +.IR getppid () +because in all UNIX variants we are aware of, it requires a round-trip +to/from kernel space and the actual work required inside the kernel is +small and bounded. +.TP +read +measures how long it takes to read one byte from \f(CB/dev/zero\fP. +Note that some operating systems do not support \f(CB/dev/zero\fP. +.TP +write +times how long it takes to write one byte to \f(CB/dev/null\fP. This +is useful as a lower bound cost on anything that has to interact with +the operating system. 
+.TP +stat +measures how long it takes to +.IR stat () +a file whose inode is already cached. +.TP +fstat +measures how long it takes to +.IR fstat () +an open file whose inode is already cached. +.TP +open +measures how long it takes to +.IR open () +and then +.IR close() +a file. +.SH OUTPUT +Output format is +.sp +.ft CB +Null syscall: 67 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_tcp.8 b/performance/lmbench3/doc/lat_tcp.8 new file mode 100644 index 0000000..c945460 --- /dev/null +++ b/performance/lmbench3/doc/lat_tcp.8 @@ -0,0 +1,52 @@ +.\" $Id: lat_tcp.8 1.2 00/10/16 17:13:49+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_TCP 8 "$Date: 00/10/16 17:13:49+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_tcp \- measure interprocess communication latency via TCP/IP +.SH SYNOPSIS +.B lat_tcp +.I -s +.sp .5 +.B lat_tcp +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B lat_tcp +.I "-S hostname" +.SH DESCRIPTION +.B lat_tcp +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.LP +.B lat_tcp +has three forms of usage: as a server (-s), as a client (lat_tcp localhost), and +as a shutdown (lat_tcp -S localhost). +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. 
+Output format is like so +.sp +.ft CB +TCP latency using localhost: 700 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_tcp.tbl b/performance/lmbench3/doc/lat_tcp.tbl new file mode 100644 index 0000000..3ec3abb --- /dev/null +++ b/performance/lmbench3/doc/lat_tcp.tbl @@ -0,0 +1,59 @@ +.KS +.TS +expand doublebox; +l c c +l r r. +System TCP \fBRPC/TCP\fP += +DEC Alpha 485 788 +DEC Alpha 581 822 +Linux alpha 419 617 +DEC Alpha 629 994 +DEC Alpha 428 851 +DEC Alpha 267 371 +DEC Alpha 526 872 +DEC Alpha 412 673 +Linux i686 263 427 +Sun SC1000 855 1386 +DEC Alpha 826 1451 +Sun Ultra1 162 346 +Linux alpha 429 602 +Linux i586 1149 1434 +SunOS-5.4 sun4m 560 1196 +SunOS-5.4 sun4d 1006 1584 +SunOS-5.4 sun4m 826 1631 +SunOS-5.4 sun4m 335 784 +SunOS-5.4 sun4d 1211 1847 +Linux i586 467 713 +Linux i486 1592 2147 +FreeBSD/i586 264 450 +FreeBSD/i586 297 510 +IRIX5.3 IP22 278 641 +IRIX64-601 IP26 467 1018 +IRIX IP22 279 580 +Linux i586 477 718 +Linux i586 1196 1506 +Linux i586 1291 1668 +Linux i486 1465 2078 +IBM PowerPC 299 698 +FreeBSD/i586 312 548 +HP-UX 9000/735 222 707 +FreeBSD/i586 290 532 +HP-UX 9000/770 186 712 +FreeBSD/i586 295 535 +HP-UX 9000/819 393 668 +HP-UX 9000/735 257 805 +HP-UX 9000/755 262 812 +HP-UX 9000/735 245 800 +HP-UX 9000/897 286 854 +dgux mc88110 1381 1851 +IBM Power2 332 649 +IRIX64 IP25 482 806 +IRIX IP19 766 913 +IRIX64 IP21 643 974 +IRIX64 IP19 886 957 +HP-UX 9000/735 248 820 +IRIX64 IP19 546 900 +IRIX5.3 IP19 815 1006 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_udp.8 b/performance/lmbench3/doc/lat_udp.8 new file mode 100644 index 0000000..1545e3f --- /dev/null +++ b/performance/lmbench3/doc/lat_udp.8 @@ -0,0 +1,52 @@ 
+.\" $Id: lat_udp.8 1.2 00/10/16 17:13:50+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_UDP 8 "$Date: 00/10/16 17:13:50+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_udp \- measure interprocess communication latency via UDP/IP +.SH SYNOPSIS +.B lat_udp +.I -s +.sp .5 +.B lat_udp +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B lat_udp +.I "-S hostname" +.SH DESCRIPTION +.B lat_udp +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.LP +.B lat_udp +has three forms of usage: as a server (-s), as a client (lat_udp localhost), and +as a shutdown (lat_udp -S localhost). +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is included. +Output format is like so +.sp +.ft CB +UDP latency using localhost: 650 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_udp.tbl b/performance/lmbench3/doc/lat_udp.tbl new file mode 100644 index 0000000..bf0ff9c --- /dev/null +++ b/performance/lmbench3/doc/lat_udp.tbl @@ -0,0 +1,56 @@ +.KS +.TS +expand doublebox; +l c c +l r r. 
+System UDP \fBRPC/UDP\fP += +DEC Alpha 404 718 +Linux alpha 180 317 +Linux alpha 199 330 +DEC Alpha 259 358 +Linux i686 112 217 +Linux i486 368 770 +Linux i586 187 366 +Linux i586 276 538 +DEC Alpha 379 717 +DEC Alpha 676 765 +DEC Alpha 489 834 +Sun Ultra1 197 267 +Linux i586 281 552 +Linux i586 272 553 +SunOS-5.4 sun4m 414 622 +SunOS-5.4 sun4m 914 1290 +DEC Alpha 569 836 +Sun SC1000 739 1101 +SunOS-5.4 sun4m 590 935 +FreeBSD/i586 213 387 +FreeBSD/i586 249 408 +HP-UX 9000/819 413 655 +IRIX5.3 IP22 313 671 +IRIX64-601 IP26 474 1008 +IRIX IP22 261 562 +Linux i486 351 831 +DEC Alpha 709 1109 +SunOS-5.4 sun4d 1084 1430 +SunOS-5.4 sun4d 1180 1562 +IRIX IP19 796 903 +FreeBSD/i586 240 420 +IBM Power2 254 531 +IBM PowerPC 206 536 +FreeBSD/i586 265 459 +IRIX64 IP21 660 783 +dgux mc88110 1373 2175 +HP-UX 9000/897 289 673 +HP-UX 9000/770 185 657 +HP-UX 9000/735 244 742 +IRIX5.3 IP19 785 960 +IRIX64 IP25 486 740 +HP-UX 9000/735 248 759 +HP-UX 9000/735 246 768 +HP-UX 9000/735 252 786 +IRIX64 IP19 814 964 +HP-UX 9000/755 244 832 +IRIX64 IP19 678 893 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_unix.8 b/performance/lmbench3/doc/lat_unix.8 new file mode 100644 index 0000000..2117b3f --- /dev/null +++ b/performance/lmbench3/doc/lat_unix.8 @@ -0,0 +1,41 @@ +.\" $Id$ +.TH LAT_UNIX 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_unix \- measure interprocess communication latency via UNIX sockets +.SH SYNOPSIS +.B lat_unix +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_unix +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. 
+.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. +Output format is like so +.sp +.ft CB +AF_UNIX sock stream latency: 700 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_unix_connect.8 b/performance/lmbench3/doc/lat_unix_connect.8 new file mode 100644 index 0000000..b42e9a4 --- /dev/null +++ b/performance/lmbench3/doc/lat_unix_connect.8 @@ -0,0 +1,43 @@ +.\" $Id$ +.TH LAT_UNIX_CONNECT 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_unix_connect \- measure interprocess connection latency via UNIX sockets +.SH SYNOPSIS +.B lat_unix_connect +.I -s +.sp .5 +.B lat_unix_connect +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.sp .5 +.B lat_unix_connect +.I "-S" +.SH DESCRIPTION +.B lat_unix_connect +is a client/server program that measures interprocess +connection latencies. The benchmark times the creation and connection of +an AF_UNIX socket to a local server. +.LP +.B lat_connect +has three forms of usage: as a server (-s), as a client (lat_connect), +and as a shutdown (lat_connect -S). +.SH OUTPUT +The reported time is in microseconds per connection. +Output format is like so +.sp +.ft CB +UNIX connection cost: 1006 microseconds +.ft +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/line.8 b/performance/lmbench3/doc/line.8 new file mode 100644 index 0000000..0e0e043 --- /dev/null +++ b/performance/lmbench3/doc/line.8 @@ -0,0 +1,50 @@ +.\" $Id$ +.TH LINE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +line \- cache line size +.SH SYNOPSIS +.B tlb +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B line +tries to determine the cache line size in bytes of the largest cache +which is smaller than +.I len +bytes. +.LP +.B line +creates pointer chains which access the first word on each cache line +on a page (randomly meandering through all the lines in a page before +jumping to the next page). It measures the average memory latency +for a variety of line sizes, starting with a line size of one word. +When it finds an increase in the average latency that is significantly +larger than the latency for the smaller line size then it assumes that +it has found the line size. +.LP +This algorithm works because for line sizes less than the true line +size, at least two +.B line +cache lines fit in the space of a true cache line. Since that cache +line will be accessed twice, the first access will cause an expensive +cache miss, while the second access will be a cache hit. Once the +.B line +cache line is equal to the true cache line size, then all accesses +will cause cache misses. +.SH BUGS +.B line +is an experimental benchmark, but it seems to work well on most +systems. +.SH "SEE ALSO" +lmbench(8), tlb(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lmbench.3 b/performance/lmbench3/doc/lmbench.3 new file mode 100644 index 0000000..e6db877 --- /dev/null +++ b/performance/lmbench3/doc/lmbench.3 @@ -0,0 +1,344 @@ +.\" +.\" @(#)lmbench.man 3.0 2000/10/12 +.\" +.\" lmbench - benchmarking toolbox +.\" +.\" Copyright (C) 1998-2000 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "LMBENCH" 3 "$Date:$" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH "NAME" +lmbench \- benchmarking toolbox +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "typedef u_long iter_t" +.LP +.B "typedef (*benchmp_f)(iter_t iterations, void* cookie)" +.LP +.B "void benchmp(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie)" +.LP +.B "uint64 get_n()" +.LP +.B "void milli(char *s, uint64 n)" +.LP +.B "void micro(char *s, uint64 n)" +.LP +.B "void nano(char *s, uint64 n)" +.lP +.B "void mb(uint64 bytes)" +.LP +.B "void kb(uint64 bytes)" +.SH "DESCRIPTION" +Creating benchmarks using the +.I lmbench +timing harness is easy. +Since it is so easy to measure performance using +.I lmbench , +it is possible to quickly answer questions that arise during system +design, development, or tuning. For example, image processing +.LP +There are two attributes that are critical for performance, latency +and bandwidth, and +.I lmbench\'s +timing harness makes it easy to measure and report results for both. +Latency is usually important for frequently executed operations, and +bandwidth is usually important when moving large chunks of data. +.LP +There are a number of factors to consider when building benchmarks. +.LP +The timing harness requires that the benchmarked operation +be idempotent so that it can be repeated indefinitely. +.LP +The timing subsystem, +.BR benchmp , +is passed up to three function pointers. Some benchmarks may +need as few as one function pointer (for +.IR benchmark ). 
+.TP +.B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" +measures the performance of +.I benchmark +repeatedly and reports the median result. +.I benchmp +creates +.I parallel +sub-processes which run +.I benchmark +in parallel. This allows lmbench to measure the system's ability to +scale as the number of client processes increases. Each sub-process +executes +.I initialize +before starting the benchmarking cycle with +.I iterations +set to 0. It will call +.I initialize , +.I benchmark , +and +.I cleanup +with +.I iterations +set to the number of iterations in the timing loop +several times in order to collect +.I repetitions +results. The calls to +.I benchmark +are surrounded by +.I start +and +.I stop +call to time the amount of time it takes to do +the benchmarked operation +.I iterations +times. +After all the benchmark results have been collected, +.I cleanup +is called with +.I iterations set to 0 to cleanup any resources which +may have been allocated by +.I initialize +or +.IR benchmark . +.I cookie +is a void pointer to a hunk of memory that can be used to store any +parameters or state that is needed by the benchmark. +.TP +.B "void benchmp_getstate()" +returns a void pointer to the lmbench-internal state used during +benchmarking. The state is not to be used or accessed directly +by clients, but rather would be passed into +.I benchmp_interval. +.TP +.B "iter_t benchmp_interval(void* state)" +returns the number of times the benchmark should execute its +benchmark loop during this timing interval. This is used only +for weird benchmarks which cannot implement the benchmark +body in a function which can return, such as the page fault +handler. Please see +.I lat_sig.c +for sample usage. +.TP +.B "uint64 get_n()" +returns the number of times +.I loop_body +was executed during the timing interval. +.TP +.B "void milli(char *s, uint64 n)" +print out the time per operation in milli-seconds. 
+.I n +is the number of operations during the timing interval, which is passed +as a parameter because each +.I loop_body +can contain several operations. +.TP +.B "void micro(char *s, uint64 n)" +print the time per opertaion in micro-seconds. +.TP +.B "void nano(char *s, uint64 n)" +print the time per operation in nano-seconds. +.TP +.B "void mb(uint64 bytes)" +print the bandwidth in megabytes per second. +.TP +.B "void kb(uint64 bytes)" +print the bandwidth in kilobytes per second. +.SH "USING lmbench" +Here is an example of a simple benchmark that measures the latency +of the random number generator +.BR lrand48() : +.IP +.B "#include ``lmbench.h''" +.br + +.br +.B void +.br +.B benchmark_lrand48(iter_t iterations, void* cookie) +.B { +.br +.B " while(iterations-- > 0)" +.br +.B " lrand48();" +.br +.B } +.br + +.br +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B { +.br +.B " benchmp(NULL, benchmark_lrand48, NULL, 0, 1, 0, TRIES, NULL);" +.br +.B " micro("lrand48()", get_n());" +.br +.B " exit(0);" +.br +.B } +.br + +.LP +Here is a simple benchmark that measures and reports the bandwidth of +.BR bcopy : +.IP +.B "#include ``lmbench.h''" +.br + +.br +.B "#define MB (1024 * 1024) +.br +.B "#define SIZE (8 * MB)" +.br + +.br +.B "struct _state {" +.br +.B " int size;" +.br +.B " char* a;" +.br +.B " char* b;" +.br +.B "};" +.br + +.br +.B void +.br +.B initialize_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " if (!iterations) return;" +.br +.B " state->a = malloc(state->size);" +.br +.B " state->b = malloc(state->size);" +.br +.B " if (state->a == NULL || state->b == NULL)" +.br +.B " exit(1);" +.br +.B "}" +.br + +.br +.B void +.br +.B benchmark_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " while(iterations-- > 0)" +.br +.B " bcopy(state->a, state->b, state->size);" +.br +.B "}" +.br + +.br +.B void +.br 
+.B cleanup_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " if (!iterations) return;" +.br +.B " free(state->a);" +.br +.B " free(state->b);" +.br +.B "}" +.br + +.br +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B "{" +.br +.B " struct _state state;" +.br + +.br +.B " state.size = SIZE;" +.br +.B " benchmp(initialize_bcopy, benchmark_bcopy, cleanup_bcopy," +.br +.B " 0, 1, 0, TRIES, &state);" +.br +.B " mb(get_n() * state.size);" +.br +.B " exit(0);" +.br +.B "}" +.br + +.LP +A slightly more complex version of the +.B bcopy +benchmark might measure bandwidth as a function of memory size and +parallelism. The main procedure in this case might look something +like this: +.IP +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B "{" +.br +.B " int size, par;" +.br +.B " struct _state state;" +.br + +.br +.B " for (size = 64; size <= SIZE; size <<= 1) {" +.br +.B " for (par = 1; par < 32; par <<= 1) {" +.br +.B " state.size = size;" +.br +.B " benchmp(initialize_bcopy, benchmark_bcopy," +.br +.B " cleanup_bcopy, 0, par, 0, TRIES, &state);" +.br +.B " fprintf(stderr, \%d\\t%d\\t\", size, par);" +.br +.B " mb(par * get_n() * state.size);" +.br +.B " }" +.br +.B " }" +.br +.B " exit(0);" +.br +.B "}" + +.SH "VARIABLES" +There are three environment variables that can be used to modify the +.I lmbench +timing subsystem: ENOUGH, TIMING_O, and LOOP_O. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), timing(3), reporting(3), results(3). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lmbench.8 b/performance/lmbench3/doc/lmbench.8 new file mode 100644 index 0000000..262515d --- /dev/null +++ b/performance/lmbench3/doc/lmbench.8 @@ -0,0 +1,222 @@ +.\" $Id: lmbench.8 1.4 00/10/16 17:13:52+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LMBENCH 8 "$Date: 00/10/16 17:13:52+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +lmbench \- system benchmarks +.SH DESCRIPTION +.B lmbench +is a series of micro benchmarks intended to measure basic operating +system and hardware system metrics. The benchmarks fall into three +general classes: bandwidth, latency, and ``other''. +.LP +Most of the +.I lmbench +benchmarks use a standard timing harness described in timing(8) +and have a few standard options: +.IR parallelism , +.IR warmup , +and +.IR repetitions . +.I Parallelism +specifies the number of benchmark processes to run in parallel. +This is primarily useful when measuring the performance of SMP +or distributed computers and can be used to evaluate the system's +performance scalability. +.I Warmup +is the minimum number of microseconds the benchmark should +execute the benchmarked capability before it begins measuring +performance. Again this is primarily useful for SMP or distributed +systems and it is intended to give the process scheduler time to +"settle" and migrate processes to other processors. By measuring +performance over various +.I warmup +periods, users may evaluate the scheduler's responsiveness. +.I Repetitions +is the number of measurements that the benchmark should take. This +allows lmbench to provide greater or lesser statistical strength to +the results it reports. The default number of +.I repetitions +is 11. +.SH BANDWIDTH MEASUREMENTS +Data movement is fundamental to the performance on most computer systems. +The bandwidth measurements are intended to show how the system can move +data.
The results of the bandwidth metrics can be compared but care +must be taken to understand what it is that is being compared. The +bandwidth benchmarks can be reduced to two main components: operating +system overhead and memory speeds. The bandwidth benchmarks report +their results as megabytes moved per second but please note that the +data moved is \fBnot\fP necessarily the same as the memory bandwidth +used to move the data. Consult the individual man pages for more +information. +.LP +Each of the bandwidth benchmarks is listed below with a brief overview of the +intent of the benchmark. +.TP 14 +bw_file_rd +reading and summing of a file via the read(2) interface. +.TP +bw_mem_cp +memory copy. +.TP +bw_mem_rd +memory reading and summing. +.TP +bw_mem_wr +memory writing. +.TP +bw_mmap_rd +reading and summing of a file via the memory mapping mmap(2) interface. +.TP +bw_pipe +reading of data via a pipe. +.TP +bw_tcp +reading of data via a TCP/IP socket. +.TP +bw_unix +reading data from a UNIX socket. +.SH LATENCY MEASUREMENTS +Control messages are also fundamental to the performance on most +computer systems. The latency measurements are intended to show how fast +a system can be told to do some operation. The results of the +latency metrics can be compared to each other +for the most part. In particular, the +pipe, rpc, tcp, and udp transactions are all identical benchmarks +carried out over different system abstractions. +.LP +Latency numbers here should mostly be in microseconds per operation. +.TP 14 +lat_connect +the time it takes to establish a TCP/IP connection. +.TP +lat_ctx +context switching; the number and size of processes is varied. +.TP +lat_fcntl +fcntl file locking. +.TP +lat_fifo +``hot potato'' transaction through a UNIX FIFO. +.TP +lat_fs +creating and deleting small files. +.TP +lat_pagefault +the time it takes to fault in a page from a file.
+.TP +lat_mem_rd +memory read latency (accurate to the ~2-5 nanosecond range, +reported in nanoseconds). +.TP +lat_mmap +time to set up a memory mapping. +.TP +lat_ops +basic processor operations, such as integer XOR, ADD, SUB, MUL, DIV, +and MOD, and float ADD, MUL, DIV, and double ADD, MUL, DIV. +.TP +lat_pipe +``hot potato'' transaction through a Unix pipe. +.TP +lat_proc +process creation times (various sorts). +.TP +lat_rpc +``hot potato'' transaction through Sun RPC over UDP or TCP. +.TP +lat_select +select latency +.TP +lat_sig +signal installation and catch latencies. Also protection fault signal +latency. +.TP +lat_syscall +non trivial entry into the system. +.TP +lat_tcp +``hot potato'' transaction through TCP. +.TP +lat_udp +``hot potato'' transaction through UDP. +.TP +lat_unix +``hot potato'' transaction through UNIX sockets. +.TP +lat_unix_connect +the time it takes to establish a UNIX socket connection. +.SH OTHER MEASUREMENTS +.TP 14 +mhz +processor cycle time +.TP +tlb +TLB size and TLB miss latency +.TP +line +cache line size (in bytes) +.TP +cache +cache statistics, such as line size, cache sizes, memory parallelism. +.TP +stream +John McCalpin's stream benchmark +.TP +par_mem +memory subsystem parallelism. How many requests can the memory +subsystem service in parallel, which may depend on the location of the +data in the memory hierarchy. +.TP +par_ops +basic processor operation parallelism. 
+.SH SEE ALSO +bargraph(1), +graph(1), +lmbench(3), +results(3), +timing(3), +bw_file_rd(8), +bw_mem_cp(8), +bw_mem_wr(8), +bw_mmap_rd(8), +bw_pipe(8), +bw_tcp(8), +bw_unix(8), +lat_connect(8), +lat_ctx(8), +lat_fcntl(8), +lat_fifo(8), +lat_fs(8), +lat_http(8), +lat_mem_rd(8), +lat_mmap(8), +lat_ops(8), +lat_pagefault(8), +lat_pipe(8), +lat_proc(8), +lat_rpc(8), +lat_select(8), +lat_sig(8), +lat_syscall(8), +lat_tcp(8), +lat_udp(8), +lmdd(8), +par_ops(8), +par_mem(8), +mhz(8), +tlb(8), +line(8), +cache(8), +stream(8) +.SH ACKNOWLEDGEMENT +Funding for the development of these tools was provided by Sun +Microsystems Computer Corporation. +.LP +A large number of people have contributed to the testing and +development of lmbench. +.SH COPYING +The benchmarking code is distributed under the GPL with additional +restrictions, see the COPYING file. +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lmbench3.ms b/performance/lmbench3/doc/lmbench3.ms new file mode 100755 index 0000000..fa41323 --- /dev/null +++ b/performance/lmbench3/doc/lmbench3.ms @@ -0,0 +1,1853 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.\" .tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.tl '''' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. 
ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl ''Page %'' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2002 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. 
+.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Measuring scalability +.ds author Carl Staelin +.ds micro \(*m +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds benchmp \f(CWbenchmp\fP +.ds bw_file_rd \f(CWbw_file_rd\fP +.ds bw_mem \f(CWbw_mem\fP +.ds bw_mmap_rd \f(CWbw_mmap_rd\fP +.ds bw_pipe \f(CWbw_pipe\fP +.ds bw_tcp \f(CWbw_tcp\fP +.ds bw_udp \f(CWbw_udp\fP +.ds bw_unix \f(CWbw_unix\fP +.ds close \f(CWclose\fP +.ds connect \f(CWconnect\fP +.ds dd \f(CWdd\fP +.ds execlp \f(CWexeclp\fP +.ds execve \f(CWexecve\fP +.ds exit \f(CWexit\fP +.ds fcntl \f(CWfcntl\fP +.ds fork \f(CWfork\fP +.ds fstat \f(CWfstat\fP +.ds gcc \f(CWgcc\fP +.ds get_n \f(CWget_n\fP +.ds getpid \f(CWgetpid\fP +.ds getppid \f(CWgetppid\fP +.ds gettime \f(CWgettime\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_connect \f(CWlat_connect\fP +.ds lat_ctx \f(CWlat_ctx\fP +.ds lat_dram_page \f(CWlat_dram_page\fP +.ds lat_fcntl \f(CWlat_fcntl\fP +.ds lat_fifo \f(CWlat_fifo\fP +.ds lat_fs \f(CWlat_fs\fP +.ds lat_http \f(CWlat_http\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_mmap \f(CWlat_mmap\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lat_pagefault \f(CWlat_pagefault\fP +.ds lat_pipe \f(CWlat_pipe\fP +.ds lat_proc \f(CWlat_proc\fP +.ds lat_rpc \f(CWlat_rpc\fP +.ds lat_select \f(CWlat_select\fP +.ds lat_sem \f(CWlat_sem\fP +.ds lat_sig \f(CWlat_sig\fP +.ds lat_syscall \f(CWlat_syscall\fP +.ds lat_tcp \f(CWlat_tcp\fP +.ds lat_udp \f(CWlat_udp\fP +.ds lat_unix \f(CWlat_unix\fP +.ds lat_unix_connect \f(CWlat_unix_connect\fP +.ds lat_usleep \f(CWlat_usleep\fP +.ds line \f(CWline\fP +.ds lmdd \f(CWlmdd\fP +.ds lmdd \f(CWlmdd\fP +.ds mb \f(CWmb\fP +.ds memmove \f(CWmemmove\fP +.ds mhz \f(CWmhz\fP +.ds micro \f(CWmicro\fP +.ds mmap \f(CWmmap\fP +.ds nano \f(CWnano\fP +.ds 
nanosleep \f(CWnanosleep\fP +.ds open \f(CWopen\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds pipe \f(CWpipe\fP +.ds popen \f(CWpopen\fP +.ds pselect \f(CWpselect\fP +.ds read \f(CWread\fP +.ds select \f(CWselect\fP +.ds semop \f(CWsemop\fP +.ds setitimer \f(CWsetitimer\fP +.ds sh \f(CW/bin/sh\fP +.ds stat \f(CWstat\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds tlb \f(CWtlb\fP +.ds uiomove \f(CWuiomove\fP +.ds usleep \f(CWusleep\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.R1 +accumulate +sort A+DT +database references-lmbench3 +label-in-text +bracket-label [ ] ", " +.R2 +.EQ +delim $$ +.EN +.TL +\s(14lmbench3: measuring scalability\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench3] extends the \*[lmbench2] +system to measure a system's performance under scalable +load to make it possible to assess parallel and +distributed computer performance with the same +power and flexibility that \*[lmbench2] brought +to uni-processor performance analysis. +There is a new timing harness, \*[benchmp], designed +to measure performance at specific levels of parallel +(simultaneous) load, and most existing benchmarks have +been converted to use the new harness. +.SP +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. +It is also designed to make it easy for users to create +additional micro-benchmarks that can measure features, +algorithms, or subsystems of particular interest to the +user. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] is a widely used suite of micro-benchmarks +that measures important aspects of computer system +performance, such as memory latency and bandwidth. 
+Crucially, the suite is written in portable ANSI-C +using POSIX interfaces and is intended to run on a +wide range of systems without modification. +.LP +The benchmarks included in the suite were chosen +because in the \*[lmbench] developer's experience, +they each represent an aspect of system performance +which has been crucial to an application's +performance. +Using this multi-dimensional performance analysis +approach, it is possible to better predict and +understand application performance because +key aspects of application performance can often be +understood as linear combinations of the elements +measured by \*[lmbench] +.[[ +Brown97 +.]]. +.LP +\*[lmbench3] extends the \*[lmbench] suite to +encompass parallel and distributed system performance +by measuring system performance under scalable load. +This means that the user can specify the number of +processes that will be executing the benchmarked +feature in parallel during the measurements. +It is possible to utilize this framework to develop +benchmarks to measure distributed application +performance, but it is primarily intended to +measure the performance of multiple processes +using the same system resource at the same time. +.LP +In general the benchmarks report either the latency +or bandwidth of an operation or data pathway. The +exceptions are generally those benchmarks that +report on a specific aspect of the hardware, such +as the processor clock rate, which is reported +in MHz and nanoseconds. +.LP +\*[lmbench] consists of three major components: +a timing harness, the individual benchmarks +built on top of the timing harness, and the +various scripts and glue that build and run the +benchmarks and process the results. +.NH 2 +\*[lmbench] history +.LP +\*[lmbench1] was written by Larry McVoy +while he was at Sun Microsystems. +It focussed on two measures of system performance: +latency and bandwidth. 
+It measured a number of basic operating system +functions, such as file system read/write bandwidth +or file creation time. +It also focussed a great deal of energy on measuring +data transfer operations, such as \*[bcopy] and +\*[pipe] latency and bandwidth as well as raw +memory latency and bandwidth. +.LP +Shortly after the \*[lmbench1] paper +.[[ +McVoy96 +.]] +was published, Aaron Brown examined the \*[lmbench] +benchmark suite and published a detailed critique +of its strengths and weaknesses +.[[ +Brown97 +.]]. +Largely in response to these remarks, development +of \*[lmbench2] began with a focus on improving the +experimental design and statistical data analysis. +The primary change was the development and adoption +across all the benchmarks of a timing harness that +incorporated loop-autosizing and clock resolution +detection. +In addition, each experiment was typically repeated +eleven times with the median result reported to the +user. +.LP +The \*[lmbench2] +.[[ +Staelin98 +.]] +timing harness was implemented through a new macro, +BENCH(), that automatically manages nearly all aspects +of accurately timing operations. +For example, it automatically detects the minimal timing +interval necessary to provide timing results within 1% +accuracy, and it automatically repeats most experiments +eleven times and reports the median result. +.LP +\*[lmbench3] focussed on extending +\*[lmbench]'s functionality along two dimensions: +measuring multi-processor scalability and measuring +basic aspects of processor micro-architecture. +.LP +An important feature of multi-processor systems is their +ability to scale their performance. +While \*[lmbench1] and \*[lmbench2] measure various +important aspects of system performance, they cannot +measure performance with more than one client process +active at a time. +Consequently, measuring performance of multi-processor +and clustered systems as a function of scalable load +was impossible using those tools. 
+.LP +\*[lmbench3] took the ideas and techniques +developed in the earlier versions and extended them +to create a new timing harness which can measure +system performance under parallel, scalable loads. +.LP +\*[lmbench3] also includes a version of John +McCalpin's STREAM benchmarks. Essentially the STREAM +kernels were placed in the new \*[lmbench] timing harness. +Since the new timing harness also measures scalability +under parallel load, the \*[lmbench3] STREAM +benchmarks include this capability automatically. +.LP +Finally, \*[lmbench3] includes a number of new +benchmarks which measure various aspects of the +processor architecture, such as basic operation +latency and parallelism, to provide developers +with a better understanding of system capabilities. +The hope is that better informed developers will +be able to better design and evaluate performance +critical software in light of their increased +understanding of basic system performance. +.NH 1 +Prior Work +.LP +Benchmarking is not a new field of endeavor. +There are a wide variety of approaches to +benchmarking, many of which differ greatly +from that taken by \*[lmbench]. +.LP +One common form of benchmark is to take an +important application or application and +worklist, and to measure the time required +to complete the entire task. +This approach is particularly useful when +evaluating the utility of systems for a +single and well-known task. +.LP +Other benchmarks, such as SPECint, use a +variation on this approach by measuring +several applications and combining the +results to predict overall performance. +SPEChpc96 +.[[ +SPEChpc96 +.]] +extends this approach to the +parallel and distributed domain by measuring +the performance of a selected parallel +applications built on top of MPI and/or PVM. 
+.\" .LP +.\" XXX Byte benchmark +.LP +Another variation takes the "kernel" of +an important application and measures its +performance, where the "kernel" is usually +a simplification of the most expensive +portion of a program. +Dhrystone +.[[ +Weicker84 +.]] +is an example of this type of +benchmark as it measures the performance +of important matrix operations and was often +used to predict system performance for +numerical operations. +.LP +Banga developed a benchmark to measure HTTP server +performance which can accurately measure +server performance under high load +.[[ +Banga97 +.]]. +Due to the idiosyncrasies of the HTTP protocol +and TCP design and implementation, there are +generally operating system limits on the rate +at which a single system can generate +independent HTTP requests. +However, Banga developed a system which can +scalably present load to HTTP servers in spite +of this limitation +.[[ +Banga98 +.]]. +.LP +John McCalpin's STREAM benchmark measures +memory bandwidth during four common vector +operations +.[[ +McCalpin95 +.]]. +It does not measure memory latency, and +strictly speaking it does not measure raw +memory bandwidth although memory bandwidth +is crucial to STREAM performance. +More recently, STREAM has been extended to +measure distributed application performance +using MPI to measure scalable memory subsystem +performance, particularly for multi-processor +machines. +.LP +Prestor +.[[ +Prestor01 +.]] +and Saavedra +.[[ +Saavedra95 +.]] +have developed benchmarks which analyze +memory subsystem performance. +.LP +Micro-benchmarking extends the "kernel" +approach, by measuring the performance +of operations or resources in isolation. +\*[lmbench] and many other benchmarks, such +as nfsstone +.[[ +Shein89 +.]], +measure the performance of key operations so +users can predict performance for certain +workloads and applications by combining the +performance of these operations in the right +mixture.
+.LP +Saavedra +.[[ +Saavedra92 +.]] +takes the micro-benchmark approach and applies +it to the problem of predicting application +performance. +They analyze applications or other benchmarks +in terms of their ``narrow spectrum benchmarks'' +to create a linear model of the application's +computing requirements. +They then measure the computer system's +performance across this set of micro-benchmarks +and use a linear model to predict the application's +performance on the computer system. +Seltzer +.[[ +Seltzer99 +.]] +applied this technique using the features +measured by \*[lmbench] as the basis for +application prediction. +.LP +Benchmarking I/O systems has proven particularly +troublesome over the years, largely due to the +strong non-linearities exhibited by disk systems. +Sequential I/O provides much higher bandwidth +than non-sequential I/O, so performance is +highly dependent on the workload characteristics +as well as the file system's ability to +capitalize on available sequentiality by +laying out data contiguously on disk. +.LP +I/O benchmarks have a tendency to age poorly. +For example, IOStone +.[[ +Park90a +.]], +IOBench +.[[ +Wolman89 +.]], +and the Andrew benchmark +.[[ +Howard88 +.]] +used fixed size datasets, whose size was +significant at the time, but which no longer +measure I/O performance as the data can now +fit in the processor cache of many modern +machines. +.LP +The Andrew benchmark attempts to separately +measure the time to create, write, re-read, +and then delete a large number of files in +a hierarchical file system. +.LP +Bonnie +.[[ +Bray90 +.]] +measures sequential, streaming I/O bandwidth +for a single process, and random I/O latency +for multiple processes. +.LP +Peter Chen developed an adaptive harness for +I/O benchmarking +.[[ +Chen93d +.]] +.[[ +Chen94a +.]], +which defines I/O load in terms of five parameters, +uniqueBytes, sizeMean, readFrac, seqFrac, and +processNum. 
The benchmark then explores the +parameter space to measure file system performance +in a scalable fashion. +.LP +Parkbench +.[[ +Parkbench +.]] +is a benchmark suite that can analyze parallel +and distributed computer performance. +It contains a variety of benchmarks that measure +both aspects of system performance, such as +communication overheads, and distributed application +kernel performance. +Parkbench contains benchmarks from both NAS +.[[ +NAS +.]] +and Genesis +.[[ +Glendinning94 +.]]. +.NH 1 +Timing Harness +.LP +The first, and most crucial element in extending +\*[lmbench2] so that it could measure scalable +performance, was to develop a new timing harness +that could accurately measure performance for +any given load. +Once this was done, then each benchmark would +be migrated to the new timing harness. +.LP +The harness is designed to accomplish a number +of goals: +.IP 1. +during any timing interval of any child it is +guaranteed that all other child processes are +also running the benchmark +.IP 2. +the timing intervals are long enough to average +out most transient OS scheduler effects +.IP 3. +the timing intervals are long enough to ensure +that error due to clock resolution is negligible +.IP 4. +timing measurements can be postponed to allow +the OS scheduler to settle and adjust to the +load +.IP 5. +the reported results should be representative +and the data analysis should be robust +.IP 6. +timing intervals should be as short as possible +while ensuring accurate results +.LP +Developing an accurate timing harness with a +valid experimental design is more difficult +than is generally supposed. +Many programs incorporate elementary timing +harnesses which may suffer from one or more +defects, such as insufficient care taken to +ensure that the benchmarked operation is run +long enough to ensure that the error introduced +by the clock resolution is insignificant.
+The basic elements of a good timing harness +are discussed in +Staelin +.[[ +Staelin98 +.]]. +.LP +The new timing harness must also collect and process +the timing results from all the child processes so +that it can report the representative performance. +It currently reports the median performance over +all timing intervals from all child processes. It +might perhaps be argued that it should report the +median of the medians. +.LP +When running benchmarks with more than one child, +the harness must first get a baseline estimate +of performance by running the benchmark in only +one process using the standard \*[lmbench] timing +interval, which is often 5,000 microseconds. +Using this information, the harness can compute +the average time per iteration for a single +process, and it uses this figure to compute the +number of iterations necessary to ensure that +each child runs for at least one second. +.NH 2 +Clock resolution +.LP +\*[lmbench] uses the \*[gettimeofday] clock, whose +interface resolves time down to 1 microsecond. +However, many system clocks' resolution is only 10 +milli-seconds, and there is no portable way to query +the system to discover the true clock resolution. +.LP +The problem is that the timing intervals must +be substantially larger than the clock resolution +in order to ensure that the timing error doesn't +impact the results. For example, the true duration +of an event measured with a 10 milli-second clock +can vary $+-$10 milli-seconds from the true time, +assuming that the reported time is always a +truncated version of the true time. If the clock +itself is not updated precisely, the true error +can be even larger. +This implies that timing intervals on these systems +should be at least 1 second. +.LP +However, the \*[gettimeofday] clock resolution in +most modern systems is 1 microsecond, so timing +intervals can be as small as a few milli-seconds +without incurring significant timing errors related +to clock resolution.
+.LP +Since there is no standard interface to query the operating +system for the clock resolution, \*[lmbench] must +experimentally determine the appropriate timing +interval duration which provides results in a timely +fashion with a negligible clock resolution error. +.NH 2 +Coordination +.LP +Developing a timing harness that correctly manages +$N$ processes and accurately measures system performance +over those same $N$ processes is significantly more difficult +than simply measuring system performance with a single +process because of the asynchronous nature of +parallel programming. +.LP +In essence, the new timing harness needs to create +$N$ jobs, and measure the average performance of the +target subsystem while all $N$ jobs are running. This +is a standard problem for parallel and distributed +programming, and involves starting the child +processes and then stepping through a handshaking +process to ensure that all children have started +executing the benchmarked operation before any child +starts taking measurements. +.TSTART +.TS +box tab (/) box expand ; +c c +l l . 
+Parent/Child +T{ +\(bu start up P child processes +T}/ +T{ +\(bu wait for P \fIready\fR signals +T}/T{ +\(bu run benchmark operation for a little while +T} +\(da/T{ +\(bu send a \fIready\fR signal +T} +T{ +\(bu on receipt of \fIready\fR signals, sleep for \fIwarmup\fR \*[micro]s +T}/T{ +\(bu run benchmark operation while polling for a \fIgo\fR signal +T} +T{ +\(bu send \fIgo\fR signal to P children +T}/\(da +T{ +\(bu wait for P \fIdone\fR signals +T}/T{ +\(bu on receipt of \fIgo\fR signal, begin timing benchmark operation +T} +\(da/T{ +\(bu send a \fIdone\fR signal +T} +T{ +\(bu on receipt of \fIdone\fR signals, iterate through children +sending \fIresults\fR signal and gathering results +T}/T{ +\(bu run benchmark operation while polling for a \fIresults\fR signal +T} +T{ +\(bu collate results +T}/T{ +\(bu on receipt of \fIresults\fR signal, send timing results +and wait for \fIexit\fR signal +T} +T{ +\(bu send \fIexit\fR signal +T}/\(da +/T{ +\(bu exit +T} +.TE +.TEND "Timing harness sequencing" +.nr TABLEseq \n[TABLE] +.LP +Table \n[TABLEseq] shows how the parent and child +processes coordinate their activities to ensure +that all children are actively running the +benchmark activity while any child could be +taking timing measurements. +.LP +The reason for the separate "exit" signal is +to ensure that all properly managed children +are alive until the parent allows them to die. +This means that any SIGCHLD events that occur +before the "exit" signal indicate a child +failure. +.NH 2 +Accuracy +.LP +The new timing harness also needs to ensure that the +timing intervals are long enough for the results to +be representative. The previous timing harness assumed +that only single process results were important, and +it was able to use timing intervals as short as +possible while ensuring that errors introduced by +the clock resolution were negligible. +In many instances this meant that the timing intervals +were smaller than a single scheduler time slice.
+The new timing harness must run benchmarked operations +long enough to ensure that timing intervals are longer +than a single scheduler time slice. +Otherwise, you can get results which are complete nonsense. +For example, running several copies of an \*[lmbench2] +benchmark on a uni-processor machine will often report +that the per-process performance with $N$ jobs running in +parallel is equivalent to the performance with a single +job running!\** +.FS +This was discovered by someone who naively attempted +to parallelize \*[lmbench2] in this fashion, and I +received a note from the dismayed developer describing +the failed experiment. +.FE +.LP +In addition, since the timing intervals now have to be +longer than a single scheduler time slice, they also +need to be long enough so that a single scheduler time +slice is insignificant compared to the timing interval. +Otherwise the timing results can be dramatically +affected by small variations in the scheduler's +behavior. +.LP +Currently \*[lmbench] does not measure the scheduler +timeslice; the design blithely assumes that timeslices +are generally on the order of 10-20ms, so one second +timing intervals are sufficient. +Some schedulers may utilize longer time slices, but +this has not (yet) been a problem. +.NH 2 +Resource consumption +.LP +One important design goal was that resource consumption +be constant with respect to the number of child +processes. +This is why the harness uses shared pipes to communicate +with the children, rather than having a separate set of +pipes to communicate with each child. +An early design of the system utilized a pair of pipes +per child for communication and synchronization between +the master and slave processes. However, as the number +of child processes grew, the fraction of system +resources consumed by the harness grew and the additional +system overhead could start to interfere with the accuracy +of the measurements. 
+.LP +Additionally, if the master has to poll (\*[select]) +$N$ pipes, then the system overhead of that operation +also scales with the number of children. +.NH 2 +Pipe atomicity +.LP +Since all communication between the master process and +the slave (child) processes is done via a set of shared +pipes, we have to ensure that we never have a situation +where the message can be garbled by the intermingling +of two separate messages from two separate children. +This is ensured by either using pipe operations that +are guaranteed to be atomic on all machines, or by +coordinating between processes so that at most one +process is writing at a time. +.LP +The atomicity guarantees are provided by having each +client communicate synchronization states in one-byte +messages. For example, the signals from the master +to each child are one-byte messages, so each child +only reads a single byte from the pipe. Similarly, +the responses from the children back to the master +are also one-byte messages. In this way no child +can receive partial messages, and no message can +be interleaved with any other message. +.LP +However, using this design means that we need to +have a separate pipe for each \fIbarrier\fR in +the process, so the master uses three pipes to +send messages to the children, namely: \fIstart_signal\fR, +\fIresult_signal\fR, and \fIexit_signal\fR. +If a single pipe was used for all three barrier events, +then it is possible for a child to miss a signal, +or if the signal is encoded into the message, +then it is possible for a child to infinite loop +pulling a signal off the pipe, recognizing that +it has already received that signal so that it +needs to push it back into the pipe, and then +re-receiving the same message it just re-sent. +.LP +However, all children share a single pipe to send +data back to the master process. Usually the +messages on this pipe are single-byte signals, +such as \fIready\fR or \fIdone\fR.
However, the +timing data results need to be sent from the +children to the master and they are (much) larger +than a single-byte message. In this case, the +timing harness sends a single-byte message on +the \fIresult_signal\fR channel, which can be +received by at most one child process. This +child then knows that it has sole ownership of +the response pipe, and it writes its entire +set of timing results to this pipe. Once the +master has received all of the timing results +from a single child, it sends the next one-byte +message on the \fIresult_signal\fR channel to +gather the next set of timing results. +.TSTART 1 +.so lmbench3_signals.pic +.FEND "Control signals" 1 +.nr FIGUREsig \n[FIGURE] +.LP +The design of the signals is shown in Figure \n[FIGUREsig]. +.NH 2 +Benchmark initialization +.LP +By allowing the benchmark to specify an +initialization routine that is run in the +child processes, the new timing harness +allows benchmarks to do either or both +global initializations that are shared +by all children and specific per-child +initializations that are done independently +by each child. +Global initialization is done in the +master process before the \*[benchmp] +harness is called, so the state is +preserved across the \*[fork] operations. +Per-child initialization is done inside +the \*[benchmp] harness by the optional +initialization routine and is done after +the \*[fork] operation. +.LP +Similarly, each benchmark is allowed to +specify a cleanup routine that is run by +the child processes just before exiting. +This allows the benchmark routines to +release any resources that they may have +used during the benchmark. +Most system resources would be automatically +released on process exit, such as file +descriptors and shared memory segments, +but some resources such as temporary files +might need to be explicitly released by +the benchmark. 
+.NH 2 +Scheduler transients +.LP +Particularly on multi-processor systems, side-effects +of process migration can dramatically affect program +runtimes. For example, if the processes are all +initially assigned to the same processor as the parent +process, and the timing is done before the scheduler +migrates the processes to other available processors, +then the system performance will appear to be that of +a uniprocessor. Similarly, if the scheduler is +over-enthusiastic about re-assigning processes to +processors, then performance will be worse than +necessary because the processes will keep encountering +cold caches and will pay exorbitant memory access +costs. +.LP +The first case is a scheduler transient, and users +may not want to measure such transient phenomena +if their primary interest is in predicting performance +for long-running programs. Conversely, that same +user would be extraordinarily interested in the +second phenomenon. The harness was designed to +allow users to specify that the benchmarked processes +are run for long enough to (hopefully) get the +scheduler past the transient startup phase, so it +can measure the steady-state behavior. +.NH 2 +Data analysis +.LP +Analyzing the data to produce representative results +is a crucial step in the benchmarking process. +\*[lmbench] generally reports the \fImedian\fP +result for $11$ measurements. +Most benchmarks report the results of a single measurement +.[[ +Howard88 +.]], +an average of several results +.[[ +McCalpin95 +.]], +or a trimmed mean +.[[ +Brown97 +.]]. +.\" XXX UNKNOWN: +.\" .RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90 +.LP +Since \*[lmbench] is able to use timing intervals +that are often smaller than a scheduler time slice +when measuring single-process performance, the raw +timing results are often severely skewed. +Often most results cluster around a single value with +a small number of outliers with significantly +larger values. 
+The median is preferable to the mean when the data +can be very skewed +.[[ +Jain91 +.]]. +Since the timing intervals are significantly longer +when the desired load is larger than a single +process, the results tend not to be as badly skewed. +In these cases we could use the \fImean\fR instead, +but we decided to use a uniform statistical framework, +so we usually use the median. +.LP +In some instances, however, \*[lmbench] internally +uses the \fIminimum\fP rather than the median, +such as in \*[mhz]. +In those instances, we are not trying to find the +\fIrepresentative\fP value, but rather the +\fIminimum\fP value. +There are only a few sources of error which could +cause the measured timing result to be shorter +than the true elapsed time: the system clock is +adjusted, or round-off error in the clock resolution. +The timing interval duration is set to ensure that +the round-off error is bounded to 1% of the timing +interval, and we blithely assume that people don't +reset their system clocks while benchmarking their +systems. +.LP +\*[lmbench] does not currently report any statistics +representing measurement variation, such as the +difference between the first and third quartiles. +This is an enhancement under active consideration. +.NH 1 +Interface +.LP +Unfortunately we had to move away from the +macro-based timing harness used in \*[lmbench2] +and migrate to a function-based system +because the macros were too large for some +C pre-processors. 
+.TSTART 1 +.DS L +\f(CWtypedef void (*bench_f)(iter_t iters, + void* cookie); +typedef void (*support_f)(void* cookie); + +extern void benchmp(support_f initialize, + bench_f benchmark, + support_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie); + +extern uint64 gettime(); +extern uint64 get_n(); +extern void nano(char* s, uint64 n); +extern void micro(char* s, uint64 n); +extern void mb(uint64 bytes);\fP +.DE +.FEND "Programming interface" 1 +.nr FIGinterface \n[FIGURE] +.LP +Figure \n[FIGinterface] shows the key elements +of the new timing harness and result reporting +interface. +A brief description of the \*[benchmp] parameters: +.IP \fIenough\fR +Enough can be used to ensure that a timing interval is at +least 'enough' microseconds in duration. For most benchmarks +this should be zero, but some benchmarks have to run for more +time due to startup effects or other transient behavior. +.IP \fIparallel\fR +is simply the number of instances of the benchmark +that will be run in parallel on the system. +.IP \fIwarmup\fR +can be used to force the benchmark to run for warmup +microseconds before the system starts making timing measurements. +Note that it is a lower bound, not a fixed value, since it +is simply the time that the parent sleeps after receiving the +last "ready" signal from each child (and before it sends +the "go" signal to the children). +.IP \fIrepetitions\fR +is the number of times the experiment should +be repeated. The default is eleven. +.IP \fIcookie\fR +is a pointer that can be used by the benchmark +writer to pass in configuration information, such as buffer +size or other parameters needed by the inner loop. +In \*[lmbench3] it is generally used to point +to a structure containing the relevant configuration +information. +.LP +\*[gettime] returns the median timing +interval duration, while \*[get_n] returns +the number of iterations executed during +that timing interval. 
+.LP +\*[nano] and \*[micro] print the passed +string latency followed by the latency +in terms of nanoseconds and microseconds +respectively. +The latency is computed as $gettime()/n$, +where $n$ is the passed parameter. +The reason $n$ is passed as a parameter +is because the benchmark can actually +execute the operation of interest multiple +times during a single iteration. +For example, the memory latency benchmarks +typically repeat the memory load operation +a hundred times inside the loop, so the +actual number of operations is +$100 times get_n()$, and it is this value +that should be passed to \*[nano] or \*[micro]. +.LP +\*[mb] reports the bandwidth in MB/s +when given the total number of bytes +processed during the timing interval. +Note that for scalable benchmarks that +process $"size"$ bytes per iteration, the +total number of bytes processed is +$get_n() times parallel times "size"$. +.TSTART 1 +.DS L +\f(CW#include "bench.h" + +void +bench(iter_t iters, void* cookie) +{ + while (iters-- > 0) { + getppid(); + } +} + +int +main(int argc, char* argv[]) +{ + benchmp(NULL, bench, NULL, + 0, 1, 0, TRIES, NULL); + nano("getppid", get_n()); + return(0); +}\fP +.DE +.FEND "A sample benchmark" 1 +.nr FIGsample \n[FIGURE] +.LP +Figure \n[FIGsample] shows a sample benchmark +that measures the latency of the \*[getppid] +system call using this timing harness. +Since there is no setup or cleanup needed +for this benchmark, the \fIinitialize\fR +and \fIcleanup\fR parameters are NULL. +The \fIbench\fR routine simply calls +\*[getppid] as many times as requested, +and the rest of the parameters, \fIenough\fR, +\fIparallel\fR, \fIwarmup\fR, +\fIrepetitions\fR, and \fIcookie\fR +are given with the default values. +.NH 1 +Benchmarks +.LP +\*[lmbench] contains a large number of micro-benchmarks +that measure various aspects of hardware and operating +system performance. 
The benchmarks generally measure +latency or bandwidth, but some new benchmarks also +measure instruction-level parallelism. +.TSTART +.TS +center box tab (&); +c c +l & l . +Name&Measures +_ +&\fBBandwidth\fR +\fIbw_file_rd\fR&T{ +\*[read] and then load into processor +T} +\fIbw_mem\fR&T{ +read, write, and copy data to/from memory +T} +\fIbw_mmap_rd\fR&read from \*[mmap]'ed memory +\fIbw_pipe\fR&\*[pipe] inter-process data copy +\fIbw_tcp\fR&TCP inter-process data copy +\fIbw_unix\fR&UNIX inter-process +_ +&\fBLatency\fR +lat_connect&TCP connection +\fIlat_ctx\fR&T{ +context switch via \*[pipe]-based ``hot-potato'' token passing +T} +lat_dram_page&T{ +DRAM page open +T} +\fIlat_fcntl\fR&T{ +\*[fcntl] file locking ``hot-potato'' token passing +T} +\fIlat_fifo\fR&T{ +FIFO ``hot-potato'' token passing +T} +lat_fs&file creation and deletion +lat_http&http GET request latency +\fIlat_mem_rd\fR&memory read +\fIlat_mmap\fR&\*[mmap] operation +\fIlat_ops\fR&T{ +basic operations (\fIxor\fR, \fIadd\fR, \fImul\fR, \fIdiv\fR, \fImod\fR) +on (relevant) basic data types (\fIint\fR, \fIint64\fR, \fIfloat\fR, +\fIdouble\fR) +T} +\fIlat_pagefault\fR&page fault handler +\fIlat_pipe\fR&\*[pipe] ``hot-potato'' token passing +\fIlat_pmake\fR&T{ +time to complete $N$ parallel jobs that each do $usecs$-worth of work +T} +\fIlat_proc\fR&T{ +procedure call overhead and process creation using \*[fork], +\*[fork] and \*[execve], and \*[fork] and \*[sh] +T} +\fIlat_rand\fR&T{ +random number generator +T} +\fIlat_rpc\fR&SUN RPC procedure call +\fIlat_select\fR&\*[select] operation +\fIlat_sem\fR&T{ +semaphore ``hot-potato'' token passing +T} +\fIlat_sig\fR&T{ +signal handle installation and handling +T} +\fIlat_syscall\fR&T{ +\*[open], \*[close], \*[getppid], \*[write], \*[stat], \*[fstat] +T} +\fIlat_tcp\fR&TCP ``hot-potato'' token passing +\fIlat_udp\fR&UDP ``hot-potato'' token passing +\fIlat_unix\fR&UNIX ``hot-potato'' token passing +\fIlat_unix_connect\fR&UNIX socket connection 
+\fIlat_usleep\fR&T{ +\*[usleep], \*[select], \*[pselect], \*[nanosleep], \*[setitimer] +timer resolution +T} +_ +&\fBOther\fR +disk&T{ +zone bandwidths and seek times +T} +line&cache line size +lmdd&\fIdd\fR clone +par_mem&memory subsystem ILP +par_ops&basic operation ILP +\fIstream\fR&STREAM clones +tlb&TLB size +.TE +.TEND "\*[lmbench] micro-benchmarks" +.nr TABLEbench \n[TABLE] +.LP +Table \n[TABLEbench] contains the full list of micro-benchmarks +in \*[lmbench3]. +Benchmarks that were converted to measure performance +under scalable load are shown in italics, while the +remaining benchmarks are shown with normal typeface. +A detailed description of most benchmarks can be found in +.[[ +McVoy96 +.]]. +.NH 1 +Scaling Benchmarks +.LP +There are a number of issues associated with converting +single-process benchmarks with a single process to +scalable benchmarks with several independent processes, +in addition to the various issues addressed by +the timing harness. +Many of the benchmarks consume or utilize system +resources, such as memory or network bandwidth, +and a careful assessment of the likely resource +contention issues is necessary to ensure that the +benchmarks measure important aspects of system performance +and not artifacts of artificial resource contention. +.LP +For example, the Linux 2.2 and 2.4 kernels use a single lock to +control access to the kernel data structures for a file. +This means that multiple processes accessing that file +will have their operations serialized by that lock. +If one is interested in how well a system can handle +multiple independent accesses to separate files and +if the child processes all access the same file, then +this file sharing is an artificial source of contention +with potentially dramatic effects on the benchmark +results. +.NH 2 +File System +.LP +A number of the benchmarks measure aspects of file system +performance, such as \*[bw_file_rd], \*[bw_mmap_rd], +\*[lat_mmap], and \*[lat_pagefault]. 
+It is not immediately apparent how these benchmarks should +be extended to the parallel domain. For example, it may +be important to know how file system performance scales +when multiple processes are reading the same file, or +when multiple processes are reading different files. +The first case might be important for large, distributed +scientific calculations, while the second might be more +important for a web server. +.LP +However, for the operating system, the two cases are +significantly different. When multiple processes +access the same file, access to the kernel data +structures for that file must be coordinated and +so contention and locking of those structures can +impact performance, while this is less true when +multiple processes access different files. +.LP +In addition, there are any number of issues associated +with ensuring that the benchmarks are either measuring +operating system overhead (e.g., that no I/O is actually +done to disk), or actually measuring the system's I/O +performance (e.g., that the data cannot be resident in +the buffer cache). Especially with file system related +benchmarks, it is very easy to develop benchmarks that +compare apples and oranges (e.g., the benchmark includes +the time to flush data to disk on one system, but only +includes the time to flush a portion of data to disk on +another system). +.LP +\*[lmbench3] allows the user to measure either case +as controlled by a command-line switch. When measuring +accesses to independent files, the benchmarks first +create their own private copies of the file, one for +each child process. Then each process accesses its +private file. When measuring accesses to a single +file, each child simply uses the designated file +directly. +.NH 2 +Context Switching +.LP +Measuring context switching accurately is a difficult +task. \*[lmbench1] and \*[lmbench2] measured context +switch times via a "hot-potato" approach using pipes +connected in a ring. 
However, this experimental +design heavily favors schedulers that do "hand-off" +scheduling, since at most one process is active at +a time. +Consequently, it is not really a good benchmark +for measuring scheduler overhead in multi-processor +machines. +.LP +The design currently used in \*[lmbench3] is to +create $N$ \*[lmbench2]-style process rings and +to measure the context switch times with all $N$ +rings running in parallel. +This does extend the \*[lmbench2] context switch +benchmark to a scalable form, but it still suffers +from the same weaknesses. +.LP +One approach that was considered was to replace +the ring with a star formation, so the master +process would send tokens to each child and +then wait for them all to be returned. +This has the advantage that more than one process +is active at a time, reducing the sensitivity +to "hand-off" scheduling. +However, this same feature can cause problems +on a multi-processor system because several +of the context switches and working set accesses +can occur in parallel. +.LP +The design and methodology for measuring context +switching and scheduler overhead need to be revisited +so that it can more accurately measure performance +for multi-processor machines. +.NH 1 +Stream +.LP +\*[lmbench3] includes a new micro-benchmark, +\*[stream] which measures the performance of +John McCalpin's STREAM benchmark kernels for +both STREAM version 1 +.[[ +McCalpin95 +.]] +and version 2 +.[[ +McCalpin2002 +.]]. +This benchmark faithfully recreates each of the +kernel operations from both STREAM benchmarks, +and because of the powerful new timing harness it +can easily measure memory system scalability. +.TSTART +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . 
+Stream +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +COPY|$a[i]=b[i]$|8(+8)|8|0 +SCALE|$a[i]=q times b[i]$|8(+8)|8|1 +ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 +TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2(-1) +.TE +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . +Stream2 +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +FILL|$a[i]=q$|0(+8)|8|0 +COPY|$a[i]=b[i]$|8(+8)|8|0 +DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2(-1) +SUM|$sum=sum + a[i]$|8|0|1 +.TE +.TEND "Stream operations" +.LP +Table \n[TABLE] is based on McCalpin's tables +.[[ +McCalpin95 +.]] +.[[ +McCalpin2002 +.]] +and shows the four kernels for each version +of the \*[stream] benchmark. +Note that the +.I read +columns include numbers in parentheses, which +represent the average number of bytes read into +the cache as a result of the write to that +variable\**. +.FS +This number is independent of the cache +line size because the STREAM uses dense +arrays, so the cost is amortized over the +subsequent operations on the rest of the +line. +.FE +Cache lines are almost invariably +bigger than a single double, and so when a +write miss occurs the cache will read the line +from memory and then modify the selected bytes. +Sometimes vector instructions such as SSE +and 3DNow can avoid this load by writing an +entire cache line at once. +.LP +In addition, some architectures support +multiply-add instructions which can do +both the multiply and add operations for +TRIAD and DAXPY in a single operation, +so the physical FLOPS count would be 1 +for these architectures on these +instructions. +The numbers in parenthesis in the +.I FLOPS +column reflect this reduction in +FLOPS count. +.LP +Following the STREAM bandwidth reporting +conventions, the \*[lmbench] STREAM benchmarks +report their results as bandwidth results +(MB/s) computed as a function of the amount +of data explicitly read or written by the +benchmark. 
+For example, \fIcopy\fR and \fIscale\fR copy +data from one array to the other, so the +bandwidth is measured as a function of the +amount of data read plus the amount of data +written, or the sum of the two array sizes. +Similarly, \fIsum\fR, \fItriad\fR, and \fIdaxpy\fR +operate on three arrays, so the amount of data +transferred is the sum of the sizes of the three +arrays. +Note that the actual amount of data that is +transferred by the system may be larger +because in the write path the cache may +need to fetch (read) the cache line before +a portion of it is overwritten by dirty data. +.NH 1 +Unscalable benchmarks +.LP +There are a number of benchmarks which either +did not make sense for scalable load, such as +\*[mhz], or which could not +be extended to measure scalable load due to +other constraints, such as \*[lat_connect]. +.LP +\*[mhz] measures the processor clock speed, +which is not a scalable feature of the system, +so it doesn't make any sense to create a +version of it that measures scalable performance. +.LP +More specifically, \*[lat_connect] measures +the latency of connecting to a TCP socket. +TCP implementations have a timeout on +sockets and there is generally a fixed size +queue for sockets in the TIMEOUT state. +This means that once the queue has been +filled by a program connecting and closing +sockets as fast as possible, then all new +socket connections have to wait TIMEOUT +seconds. Needless to say, this gives no +insight into the latency of socket creation +per se, but is rather a boring artifact. +Since the \*[lmbench2] version of the +benchmark can run for very short periods +of time, it generally does not run into +this problem and is able to correctly +measure TCP connection latency. +.LP +Any scalable version of the benchmark needs +each copy to run for at least a second, and +there are $N$ copies creating connections as +fast as possible, so it would essentially be +guaranteed to run into the TIMEOUT problem. 
+Consequently, \*[lat_connect] was not +enhanced to measure scalable performance. +.LP +\*[lat_fs] has not yet been parallelized because +of the difficulty in measuring file creation and +file deletion times in the new timing harness. +The timing harness assumes that it can ask the +benchmarked operation to be repeated as many times +as necessary. +This would mean that the file creation benchmark +could create any number of new files of a given +size, which could well fill up the file system. +The real problem lies in the file deletion benchmark. +In order to delete files of a given size, they +must have been created before the benchmark begins. +However, the number of files is not known in +advance, so the benchmark would have a difficult +time ensuring that it has created enough files. +.LP +The benchmarks that measure aspects of memory-subsystem +micro-architecture, \*[lat_dram_page], \*[line], +\*[par_mem], and \*[tlb], were not parallelized because +the multiple processes' memory access patterns would +likely interfere with one another. +For example, in \*[lat_dram_page], those accesses +which were supposed to be to open DRAM pages could +well be accessing closed DRAM pages, invalidating +the benchmark. +.LP +\*[lmdd] was not parallelized because it is +supposed to be a clone of \*[dd], and it +wasn't clear what a parallel form of \*[dd] +would look like. +.NH 1 +Results +.LP +The results presented here were obtained using +\*[lmbench] version 3.0-a2 under +Linux 2.4.18-6mdk on a two processor 450MHz PIII +running a stock Mandrake 8.2 Linux 2.4.18 kernel. +.TSTART +.TS +center box tab (&); +c | c s +l | l | l. +Benchmark&Latency ($mu$s) +_ +&1 process&2 processes +_ +null call&0.79&0.81 +null I/O&1.39&2.39 +stat&9.26&25.9 +open/close&11.7&27.1 +select (TCP)&55.3&58.6 +signal install&1.89&1.95 +signal handler&6.34&7.21 +fork process&793.&868. 
+exec process&2474&2622 +sh process&24.K&25.K +pipe&17.7&23.3 +unix socket&51.6&37.6 +UDP&70.2&70.6 +TCP&91.2&92.3 +rpc (UDP)&120.0&120.4 +rpc (TCP)&157.1&159.1 +.TE +.TEND "Latency results" +.nr TABLElatency \n[TABLE] +.TSTART +.TS +center box tab (&); +c | c s +l | l | l. +Benchmark&Bandwidth (MB/s) +_ +&1 process&2 processes +_ +pipe&155&268 +unix socket&142&179 +TCP&57.5&57.8 +bcopy(libc)&134&175 +bcopy(hand)&144&174 +memory read&319&486 +memory write&199&202 +STREAM copy&288.68&367.99 +STREAM scale&290.39&369.08 +STREAM sum&337.75&415.54 +STREAM triad&246.90&380.09 +STREAM2 fill&198.96&276.28 +STREAM2 copy&288.55&359.93 +STREAM2 daxpy&318.98&493.79 +STREAM2 sum&354.03&512.05 +.TE +.TEND "Bandwidth results" +.nr TABLEbandwidth \n[TABLE] +.TSTART +.TS +center box tab (&); +c | c s s +l | l | l | l. +Benchmark&Load +_ +&1&2&2clone +_ +bw_file_rd&151.04&266.74&273.51 +bw_mmap_rd&316.08&480.02&482.57 +lat_mmap&615&878&786 +lat_pagefault&2.9802&3.9159&3.4589 +.TE +.TEND "File bandwidth results" +.nr TABLEfile \n[TABLE] +.LP +Table \n[TABLElatency] shows the latency of +various system and communication operations +for both 1 and 2 process loads, while +Table \n[TABLEbandwidth] shows the bandwidth +of various data operations and +Table \n[TABLEfile] shows how various file +system operations scale. +Table \n[TABLEfile] shows system performance +with one process, two processes sharing the +same file, and two processes accessing their +own files. 
+.TSTART 1 +.G1 +label left "Latency (ns)" +label bottom "Memory size (MB)" +coord x 0.0004,32 y 5,300 log x +draw solid +0.00049 6.680 +0.00098 6.683 +0.00195 6.680 +0.00293 6.680 +0.00391 6.681 +0.00586 6.681 +0.00781 6.681 +0.00977 6.684 +0.01172 6.683 +0.01367 6.690 +0.01562 6.725 +0.01758 48.977 +0.01953 49.051 +0.02148 49.043 +0.02344 49.025 +0.02539 48.889 +0.02734 48.880 +0.02930 48.902 +0.03125 49.020 +0.03516 49.043 +0.03906 48.904 +0.04297 49.044 +0.04688 49.027 +0.05078 49.046 +0.05469 48.889 +0.05859 49.018 +0.06250 49.012 +0.07031 49.025 +0.07812 49.030 +0.08594 48.936 +0.09375 49.042 +0.10156 49.022 +0.10938 48.889 +0.11719 49.073 +0.12500 48.998 +0.14062 49.043 +0.15625 49.125 +0.17188 49.160 +0.18750 49.113 +0.20312 49.123 +0.21875 48.991 +0.23438 49.045 +0.25000 49.184 +0.28125 49.971 +0.31250 57.735 +0.34375 72.668 +0.37500 79.106 +0.40625 77.612 +0.43750 78.764 +0.46875 88.636 +0.50000 104.024 +1.00000 179.817 +1.50000 182.297 +2.00000 182.043 +2.50000 182.902 +3.00000 183.130 +3.50000 184.333 +4.00000 182.868 +5.00000 183.319 +6.00000 183.208 +7.00000 183.688 +8.00000 183.871 +10.00000 183.659 +12.00000 183.583 +14.00000 183.773 +16.00000 183.828 +18.00000 183.894 +20.00000 183.933 +30.00000 183.971 +new dashed +0.00049 6.811 +0.00098 6.815 +0.00195 6.825 +0.00293 6.807 +0.00391 6.803 +0.00586 6.822 +0.00781 6.826 +0.00977 6.825 +0.01172 6.922 +0.01367 6.825 +0.01562 6.866 +0.01758 49.954 +0.01953 49.989 +0.02148 50.021 +0.02344 50.019 +0.02539 50.003 +0.02734 50.085 +0.02930 50.000 +0.03125 50.187 +0.03516 49.988 +0.03906 50.032 +0.04297 49.986 +0.04688 50.186 +0.05078 50.196 +0.05469 50.107 +0.05859 50.087 +0.06250 49.983 +0.07031 50.092 +0.07812 50.135 +0.08594 50.057 +0.09375 50.188 +0.10156 65.950 +0.10938 55.614 +0.11719 54.328 +0.12500 61.700 +0.14062 59.710 +0.15625 52.637 +0.17188 82.911 +0.18750 74.304 +0.20312 72.371 +0.21875 78.124 +0.23438 74.577 +0.25000 96.374 +0.28125 110.708 +0.31250 97.832 +0.34375 103.006 +0.37500 129.292 
+0.40625 140.816 +0.43750 165.255 +0.46875 164.632 +0.50000 170.912 +1.00000 233.968 +1.50000 285.445 +2.00000 241.341 +2.50000 263.436 +3.00000 273.101 +3.50000 269.926 +4.00000 233.626 +5.00000 222.305 +6.00000 293.832 +7.00000 238.863 +8.00000 245.026 +10.00000 282.297 +12.00000 239.152 +14.00000 274.218 +16.00000 226.299 +18.00000 284.183 +20.00000 224.596 +30.00000 236.416 +"1 process" at 5,165 +"2 processes" at 0.3,280 +.G2 +.FEND "Memory subsystem performance" 1 +.nr FIGUREmem \n[FIGURE] +.LP +Figure \n[FIGUREmem] shows the memory latency +curves with 32 byte strides for one and two +process loads versus memory size. +.NH 1 +Conclusion +.LP +\*[lmbench] is a useful, portable micro-benchmark +suite designed to measure important aspects of +system performance. +\*[lmbench3] adds a number of important extensions, +such as the ability to measure system scalability. +.LP +The benchmarks are available via ftp from: +.IP +.I "http://ftp.bitmover.com/lmbench" +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into the benchmarks. +We especially thank: +Eric Anderson \s-1(HP)\s0, +Bruce Chapman \s-1(SUN)\s0, +Larry McVoy \s-1(BitMover)\s0, +David Mosberger \s-1(HP)\s0, +Wayne Scott \s-1(BitMover)\s0, +John Wilkes \s-1(HP)\s0, +and +Mitch Wright \s-1(HP)\s0. +.LP +We would also like to thank all of the people that have run the +benchmark and contributed their results; none of this would have been possible +without their assistance. +.LP +Our thanks to +all of the free software community for tools that were used during this +project. +.\" .R1 +.\" bibliography references-lmbench3 +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. 
\" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.\" References +.[ +$LIST$ +.] 
+.\" .so bios diff --git a/performance/lmbench3/doc/lmbench3_arch.fig b/performance/lmbench3/doc/lmbench3_arch.fig new file mode 100644 index 0000000..36274db --- /dev/null +++ b/performance/lmbench3/doc/lmbench3_arch.fig @@ -0,0 +1,119 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 900 1425 2100 2400 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 1950 2100 1950 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 2025 2100 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1350 1950 1350 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 1950 1500 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1650 1950 1650 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1800 1950 1800 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 1950 1950 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1200 1950 1200 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1050 1950 1050 2100 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 900 1425 2100 1425 2100 2400 900 2400 900 1425 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 2100 2100 2100 +4 0 0 50 0 0 12 0.0000 4 135 480 1275 1575 Cache\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 150 525 3450 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 300 750 300 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 600 750 600 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 750 900 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 150 2625 2250 2625 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 3075 75 3450 75 3450 300 3075 300 3075 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2550 300 2550 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2400 75 2775 75 2775 300 2400 300 2400 75 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1950 75 2325 75 2325 300 1950 300 1950 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 3225 300 3225 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2100 300 2100 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 75 825 75 825 300 225 300 225 75 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 975 75 1575 75 1575 300 975 300 975 75 +2 1 0 1 0 7 
50 0 -1 0.000 0 0 -1 0 0 2 + 1275 300 1275 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 525 300 525 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 600 2775 1800 2775 1800 3450 600 3450 600 2775 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1125 2625 1125 2775 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2775 525 2775 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 750 1950 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 + 2775 975 2775 1275 1500 1275 1500 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2925 1350 3375 1350 3375 1575 2925 1575 2925 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3000 900 3150 900 3150 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3150 1575 3150 1725 2100 1725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 898 1940 675 1875 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 + 2175 1950 2250 1950 2250 2100 2175 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2475 2025 2250 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 2400 1500 2625 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2625 750 3000 750 3000 975 2625 975 2625 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1875 750 2250 750 2250 975 1875 975 1875 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1125 750 1500 750 1500 975 1125 975 1125 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 675 750 1050 750 1050 975 675 975 675 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 750 600 750 600 975 225 975 225 750 +4 0 0 50 0 0 12 0.0000 4 135 375 300 225 ALU\001 +4 0 0 50 0 0 12 0.0000 4 135 270 75 450 bus\001 +4 0 0 50 0 0 12 0.0000 4 135 150 3150 225 fn\001 +4 0 0 50 0 0 12 0.0000 4 15 135 2850 225 ...\001 +4 0 0 50 0 0 12 0.0000 4 180 1710 1725 450 floating point registers\001 +4 0 0 50 0 0 12 0.0000 4 135 150 2475 225 f1\001 +4 0 0 50 0 0 12 0.0000 4 135 150 2025 225 f0\001 +4 0 0 50 0 0 12 0.0000 4 135 345 1050 225 FPU\001 +4 0 0 50 0 0 12 0.0000 4 135 600 900 2925 memory\001 +4 0 0 50 0 0 12 0.0000 4 135 300 2700 900 MA\001 +4 0 0 50 0 0 12 0.0000 4 180 1500 1350 1275 physical addressing\001 +4 0 0 50 0 0 12 
0.0000 4 135 765 150 1875 cache line\001 +4 0 0 50 0 0 12 0.0000 4 180 900 2550 2025 set (2-way)\001 +4 0 0 50 0 0 12 0.0000 4 135 330 3000 1500 TLB\001 +4 0 0 50 0 0 12 0.0000 4 180 915 2325 2625 memory bus\001 +4 0 0 50 0 0 12 0.0000 4 90 150 1950 900 rn\001 +4 0 0 50 0 0 12 0.0000 4 15 135 1575 900 ...\001 +4 0 0 50 0 0 12 0.0000 4 135 150 1200 900 r2\001 +4 0 0 50 0 0 12 0.0000 4 135 150 750 900 r1\001 +4 0 0 50 0 0 12 0.0000 4 135 150 300 900 r0\001 +4 0 0 50 0 0 12 0.0000 4 180 1245 975 675 integer registers\001 diff --git a/performance/lmbench3/doc/lmbench3_signals.fig b/performance/lmbench3/doc/lmbench3_signals.fig new file mode 100644 index 0000000..12e9bb1 --- /dev/null +++ b/performance/lmbench3/doc/lmbench3_signals.fig @@ -0,0 +1,95 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 225 1575 1050 2025 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 225 1800 375 1800 375 1950 225 1950 225 1800 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 225 1575 375 1575 375 1725 225 1725 225 1575 +4 0 0 50 0 0 12 0.0000 4 180 600 450 1725 working\001 +4 0 0 50 0 0 12 0.0000 4 180 465 450 1950 timing\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2025 300 2025 1725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2250 300 2250 1575 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 525 2250 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 825 2250 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1575 675 750 675 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 975 750 975 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 1125 2250 1125 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1800 300 1800 1875 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 600 2250 600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1575 675 3000 675 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 900 2250 900 +2 1 0 1 
0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 1200 3000 1200 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 1275 2250 1275 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2025 1350 3000 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 750 1650 600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 1425 2100 1275 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1500 75 2325 75 2325 300 1500 300 1500 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1575 300 1575 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2025 1350 750 1350 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 150 525 750 525 750 975 150 975 150 525 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 150 675 750 675 750 825 150 825 150 675 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 150 225 750 225 750 1350 150 1350 150 225 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 3000 600 3600 600 3600 1200 3000 1200 3000 600 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 3000 675 3600 675 3600 900 3000 900 3000 675 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 3000 225 3600 225 3600 1425 3000 1425 3000 225 +4 0 0 50 0 0 12 0.0000 4 150 480 1650 225 parent\001 +4 0 0 50 0 1 12 0.0000 4 180 420 900 525 ready\001 +4 0 0 50 0 1 12 0.0000 4 135 360 900 825 done\001 +4 0 0 50 0 1 12 0.0000 4 135 495 1200 975 results\001 +4 0 0 50 0 1 12 0.0000 4 180 1020 825 1125 timing results\001 +4 0 0 50 0 0 12 0.0000 4 135 450 3075 375 child1\001 +4 0 0 50 0 1 12 0.0000 4 135 495 2325 1200 results\001 +4 0 0 50 0 1 12 0.0000 4 180 420 2550 600 ready\001 +4 0 0 50 0 1 12 0.0000 4 135 360 2550 900 done\001 +4 0 0 50 0 1 12 0.0000 4 135 165 1350 675 go\001 +4 0 0 50 0 1 12 0.0000 4 135 300 1275 1350 exit\001 +4 0 0 50 0 0 12 0.0000 4 105 360 1650 2025 start\001 +4 0 0 50 0 0 12 0.0000 4 135 690 2325 1575 response\001 +4 0 0 50 0 0 12 0.0000 4 135 285 2100 1725 exit\001 +4 0 0 50 0 0 12 0.0000 4 135 435 1875 1875 result\001 +4 0 0 50 0 0 12 0.0000 4 135 450 225 375 child0\001 diff --git 
a/performance/lmbench3/doc/lmdd.8 b/performance/lmbench3/doc/lmdd.8 new file mode 100644 index 0000000..fdb888c --- /dev/null +++ b/performance/lmbench3/doc/lmdd.8 @@ -0,0 +1,146 @@ +.\" $Id: lmdd.8 1.1 94/11/18 01:26:35-08:00 lm@xxxxxxxxxxxxxxx $ +.TH LMDD 8 "$Date: 94/11/18 01:26:35-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lmdd \- move io for performance and debugging tests +.SH SYNOPSIS +.B lmdd +[ +.IB option = value +] .\|.\|. +.SH DESCRIPTION +.B lmdd +copies a specified input file to a specified output with possible +conversions. This program is primarily useful for timing I/O since it +prints out the timing statistics after completing. +.SH OPTIONS +.TP 15 +.BI if= name +Input file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like Sun's +.IR /dev/zero , +i.e., it provides a buffer of zeros without doing a system call to get them. +.sp .5 +The following file names are taken to mean the standard input: +.IR - , +.IR 0 , +or +.IR stdin . +.TP +.BI of= name +Output file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like +.IR /dev/null , +without doing a system call to get rid of the data. +.sp .5 +The following file names are taken to mean the standard output: +.IR - , +.IR 1 , +or +.IR stdout . +.sp .5 +The following file names are taken to mean the standard error: +.IR 2 , +or +.IR stderr . +.TP +.BI bs= n +Input and output block size +.I n +bytes (default 8192). Note that this is different from dd(1), it has +a 512 byte default. Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +respectively. +.TP +.BI ipat= n +If +.B n +is non zero, expect a known pattern in the file (see opat). Mismatches +will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is +a sequence of 4 byte integers with the first 0, second 1, and so on. +The default is not to check for the pattern. 
+.TP +.BI opat= n +If +.B n +is non zero, generate a known pattern on the output stream. Used for +debugging file system correctness. +The default is not to generate the pattern. +.TP +.BI mismatch= n +If +.B n +is non zero, stop at the first mismatched value. Used with ipat. +.TP +.BI skip= n +Skip +.IR n "" +input blocks before starting copy. +.TP +.BI fsync= n +If +.I n +is non-zero, call fsync(2) on the output file before exiting or printing +timing statistics. +.TP +.BI sync= n +If +.I n +is non-zero, call sync(2) before exiting or printing +timing statistics. +.TP +.BI rand= n +This argument, by default off, turns on random behavior. The argument is +not a flag, it is a size, that size is used as the upper bound for the +seeks. +Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +.TP +.BI flush= n +If +.I n +is non-zero and mmap(2) is available, call msync(2) to invalidate the +output file. This flushes the file to disk so that you don't have +unmount/mount. It is not as good as mount/unmount because it just +flushes file pages - it misses the indirect blocks which are still +cached. Not supported on all systems, compile time option. +.TP +.BI rusage= n +If +.I n +is non-zero, print rusage statistics as well as timing statistics. +Not supported on all systems, compile time option. +.TP +.BI count= n +Copy only +.IR n "" +input records. +.SH EXAMPLES +.LP +This is the most common usage, the intent is to measure disk performance. +The disk is a spare partition mounted on /spare. 
+.sp +.nf +.in +4 +# mount /spare +# lmdd if=internal of=/spare/XXX count=1000 fsync=1 +7.81 MB in 3.78 seconds (2.0676 MB/sec) + +: Flush cache +# umount /spare +# mount /spare + +# lmdd if=/spare/XXX of=internal +7.81 MB in 2.83 seconds (2.7611 MB/sec) +.in +.sp +.fi +.SH AUTHOR +Larry McVoy, lm@xxxxxxx diff --git a/performance/lmbench3/doc/mem.pic b/performance/lmbench3/doc/mem.pic new file mode 100644 index 0000000..a8b5971 --- /dev/null +++ b/performance/lmbench3/doc/mem.pic @@ -0,0 +1,2337 @@ +.PS +.ps 8 +.vs 11 +.ft CB +[ +# Variables, tweak these. + xtick = 2.000000 # width of an X tick + xlower = 8.000000 # where the xtick start + xupper = 24.000000 # upper range of graph + xn = 8 # number of ticks to do + ytick = 50.000000 # width of an Y tick + ylower = 0.000000 # where the ytick start + yupper = 500.000000 # upper range of graph + yn = 10 # number of ticks to do + xsize = 1.75 # width of the graph + ysize = 1.75 # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = 0.10000000000000000555 # distance towards numbers + gthk = .1 # thickness of grid lines + thk = .75 # thickness of data lines + qthk = 2.0 # thickness of quartile lines + vs = .15 # works for 10 point fonts + +# Draw the graph borders and tick marks + O: box thick 1.5 ht ysize wid xsize + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) + + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) + } + if (yupper - ylower > 999) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) + } else { + sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) + }}} + j = j + 
ytick + } + j = xlower + for i = 0 to xn by 1 do { + xs = j - xlower + g = xs * xscale + line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) + + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) + } + if (xupper - xlower > 999) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + (g, -.25) + } else { + sprintf("%.2f", j) at O.sw + (g, -.25) + }}} + j = j + xtick + } + +# DATASET: stride=8, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (18 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (23 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box 
invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (30 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (17 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (33 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (33 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (35 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (35 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ 
box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (37 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (46 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (89 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (89 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (90 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (92 - ylower)) + +# DATASET: stride=16, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (26 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (44 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), 
yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (18.32192809488736529 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (59 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (65 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (164 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht 
.05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (166 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (166 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (167 - ylower)) + +# DATASET: stride=32, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - 
xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (56 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(17.906921372774437629 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (65 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (64 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (68 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (70 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (83 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (85 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (87 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (21.321928094887361738 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (339 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (337 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (338 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (337 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (338 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (339 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (340 - ylower)) + +# DATASET: stride=64, MARK 3 +[ "\(mu" ] at O.sw + \ + (xscale * 
(9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 
wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (55 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - 
xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (76 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (79 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + 
\ + (xscale * (20 - xlower), yscale * (323 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (325 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ 
box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (327 - ylower)) + +# DATASET: stride=128, MARK 4 +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last 
[].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 
- ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (59 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ 
+ (xscale * (18.807354922057605506 - xlower), yscale * (71 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (75 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (75 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (317 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (319 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 
] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (323 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (321 - ylower)) + +# DATASET: stride=512, MARK 5 +[ box ht .07 wid .07 fill 1 ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), 
yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (18.459431618637296424 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (77 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (74 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (80 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (317 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at 
O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box ht .07 wid .07 fill 1 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (322 - ylower)) + +# DATASET: stride=1024, MARK 6 +[ "\s+2\(pl\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 
2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), 
yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 
] at O.sw + \ + (xscale * (18 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (88 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (324 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (325 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 
wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+2\(pl\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (326 - ylower)) + +# DATASET: stride=2048, MARK 7 +[ "\s+4\(**\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 
2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), 
yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] 
at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (111 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (115 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (114 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (345 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(**\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (345 - ylower)) + +# DATASET: stride=4096, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), 
yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (147 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (146 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (146 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (157 - ylower)) +line thick thk from 2nd 
last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (162 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (160 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (379 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (380 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (378 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (380 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (382 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (383 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (383 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (385 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (385 - ylower)) + +# DATASET: stride=8192, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box 
invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (231 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (231 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (230 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (240 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (246 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (246 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (441 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (451 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (443 - ylower)) +line thick thk from 2nd last 
[].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (441 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (442 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (446 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (453 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (453 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (453 - ylower)) + +# DATASET: stride=16384, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] 
at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(16.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis 
ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (72 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (243 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (432 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (447 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (448 - ylower)) +line thick thk from 2nd last [].c 
to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (448 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (447 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (452 - ylower)) + +# DATASET: stride=16384, MARK 3 + +# DATASET: stride=16384, MARK 4 + +.ps 8 +.vs 8 +"8KB" "cache" at O.sw + .35,.32 +arrow thick 2 wid .07 down .15 from O.sw + .35,.20 +".5MB" "cache" at O.sw + .85,.50 +arrow thick 2 wid .07 down .15 from O.sw + .85,.38 +"Main" "mem" at O.e - .25,.15 +arrow thick 2 wid .07 up .15 from O.e - .25,0 +.vs 
+.ps + +# Xaxis title. +"\s+2log2(Array size)\s0" rjust at O.se - (0, .6) + +# Yaxis title (Latency in nanoseconds) +.ps +2 +.vs -1 +"L" "a" "t" "e" "n" "c" "y" " " "i" "n" at O.w - (.95, 0) +"n" "a" "n" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.75, 0) +.ps +.vs + +# Graph title. +"\s+2DEC alpha@182mhz memory latencies\s0" at O.n + (-.5, .3) + +# Title. +#[ "\(ci" ] at O.ne + (.25, - 0 * vs) +#"stride=8" ljust at last [].e + (.1, 0) +#[ "\(sq" ] at O.ne + (.25, - 1 * vs) +#"stride=16" ljust at last [].e + (.1, 0) +#[ "\(*D" ] at O.ne + (.25, - 2 * vs) +#"stride=32" ljust at last [].e + (.1, 0) +#[ "\(mu" ] at O.ne + (.25, - 3 * vs) +#"stride=64" ljust at last [].e + (.1, 0) +#[ "\s+4\(bu\s0" ] at O.ne + (.25, - 4 * vs) +#"stride=128" ljust at last [].e + (.1, 0) +#[ box ht .07 wid .07 fill 1 ] at O.ne + (.25, - 5 * vs) +#"stride=512" ljust at last [].e + (.1, 0) +#[ "\s+2\(pl\s0" ] at O.ne + (.25, - 6 * vs) +#"stride=1024" ljust at last [].e + (.1, 0) +#[ "\s+4\(**\s0" ] at O.ne + (.25, - 7 * vs) +#"stride=2048" ljust at last [].e + (.1, 0) +#[ "\(ci" ] at O.ne + (.25, - 8 * vs) +#"stride=4096" ljust at last [].e + (.1, 0) +#[ "\(sq" ] at O.ne + (.25, - 9 * vs) +#"stride=8192" ljust at last [].e + (.1, 0) +#[ "\(*D" ] at O.ne + (.25, - 10 * vs) +#"stride=16384" ljust at last [].e + (.1, 0) +] +.ft +.ps +.PE diff --git a/performance/lmbench3/doc/memhier-color.d b/performance/lmbench3/doc/memhier-color.d new file mode 100644 index 0000000..50a3cef --- /dev/null +++ b/performance/lmbench3/doc/memhier-color.d @@ -0,0 +1,86 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Size (MB)" +label left "Latency (ns)" +coord log log +ticks bottom out at 0.000512 "512", 0.001024 "", 0.002048 "", 0.004096 "", 0.008192 "8K", 0.016384 "", 0.032768 "", 0.065536 "", 0.098304 "96K", 0.131072 "", 0.262144 "", 0.524288 "", 1.048576 "1M", 2.097152 "", 4.194304 "", 8.388608 "", 16.777216 "", 33.554432 "32M" +draw dotted + 
0.000512 4.042 + 0.008192 4.046 + 0.010240 8.873 + 0.012288 12.085 + 0.016384 16.097 + 0.032768 16.103 + 0.065536 19.908 + 0.098304 20.622 + 0.114688 29.808 + 0.131072 37.724 + 0.196608 47.561 + 0.262144 52.134 + 0.524288 66.410 + 1.048576 74.897 + 1.310720 153.075 + 1.572864 198.678 + 2.097152 264.935 + 3.145728 333.862 + 4.194304 366.109 + 8.388608 370.522 + 33.554432 370.682 +"Colored" ljust at 1.572864, 222.789 +draw solid + 0.000512 4.042 + 0.000640 4.043 + 0.000768 4.044 + 0.000896 4.043 + 0.001024 4.043 + 0.001280 4.044 + 0.001536 4.044 + 0.001792 4.044 + 0.002048 4.041 + 0.002560 4.044 + 0.003072 4.045 + 0.003584 4.044 + 0.004096 4.045 + 0.005120 4.046 + 0.006144 4.047 + 0.007168 4.048 + 0.008192 4.048 + 0.010240 8.872 + 0.012288 12.079 + 0.014336 14.379 + 0.016384 16.097 + 0.020480 16.104 + 0.024576 16.117 + 0.028672 16.114 + 0.032768 16.106 + 0.040960 16.110 + 0.049152 16.123 + 0.057344 18.062 + 0.065536 19.179 + 0.081920 97.039 + 0.098304 84.011 + 0.114688 81.764 + 0.131072 79.122 + 0.163840 82.634 + 0.196608 108.550 + 0.229376 104.530 + 0.262144 119.771 + 0.327680 111.317 + 0.393216 131.057 + 0.458752 143.902 + 0.524288 173.323 + 0.655360 197.268 + 0.786432 219.736 + 0.917504 224.743 + 1.048576 249.878 + 1.310720 287.157 + 1.572864 302.857 + 1.835008 315.170 + 2.097152 329.874 + 2.621440 347.418 + 3.145728 357.183 + 3.670016 362.297 + 4.194304 365.720 + 5.242880 369.345 + 33.554432 370.296 +"Malloc'ed" rjust at 0.458752, 219.736 diff --git a/performance/lmbench3/doc/memhier-line.d b/performance/lmbench3/doc/memhier-line.d new file mode 100644 index 0000000..4bb890e --- /dev/null +++ b/performance/lmbench3/doc/memhier-line.d @@ -0,0 +1,34 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Line Size (Bytes)" +label left "Latency (ns)" +coord log log +ticks bottom out from 8 to 512 by *4 +ticks bottom out from 8 to 512 by *2 "" +draw solid +8 7.247 +16 10.909 +32 16.788 +64 17.083 +128 16.272 
+256 16.721 +512 16.129 +"L1" rjust above at 512, 16.129 +draw solid +8 22.853 +16 41.496 +32 78.712 +64 141.658 +128 139.119 +256 138.446 +512 137.902 +"L2" rjust above at 512, 137.902 +draw solid +8 51.529 +16 98.915 +32 193.614 +64 372.230 +128 371.689 +256 371.486 +512 371.486 +"L3" rjust above at 512, 371.486 diff --git a/performance/lmbench3/doc/memhier-tlb.d b/performance/lmbench3/doc/memhier-tlb.d new file mode 100644 index 0000000..908e840 --- /dev/null +++ b/performance/lmbench3/doc/memhier-tlb.d @@ -0,0 +1,407 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Pages" +label left "Latency (ns)" +coord log log +draw dotted +1 4.042 +2 4.047 +3 4.043 +4 4.044 +5 4.043 +6 4.043 +7 4.045 +8 4.044 +9 4.044 +10 4.044 +11 4.044 +12 4.044 +13 4.044 +14 4.044 +15 4.045 +16 4.046 +17 4.047 +18 4.046 +19 4.046 +20 4.047 +21 4.048 +22 4.046 +23 4.047 +24 4.047 +25 4.048 +26 4.048 +27 4.048 +28 4.048 +29 4.048 +30 4.049 +31 4.048 +32 4.049 +33 4.049 +34 4.049 +35 4.049 +36 4.049 +37 4.049 +38 4.049 +39 4.071 +40 4.070 +41 4.070 +42 4.070 +43 4.070 +44 4.070 +45 4.070 +46 4.069 +47 4.070 +48 4.070 +49 4.071 +50 4.070 +51 4.070 +52 4.069 +53 4.048 +54 4.049 +55 4.069 +56 4.049 +57 4.049 +58 4.070 +59 4.048 +60 4.050 +61 4.070 +62 4.050 +63 4.048 +64 4.066 +65 4.048 +66 4.050 +67 4.069 +68 4.048 +69 4.049 +70 4.069 +71 4.049 +72 4.049 +73 4.069 +74 4.071 +75 4.071 +76 4.069 +77 4.071 +78 4.071 +79 4.069 +80 4.069 +81 4.069 +82 4.069 +83 4.069 +84 4.070 +85 4.070 +86 4.069 +87 4.070 +88 4.070 +89 4.071 +90 4.071 +91 4.070 +92 4.070 +93 4.072 +94 4.070 +95 4.049 +96 4.049 +97 4.070 +98 4.049 +99 4.050 +100 4.071 +101 4.050 +102 4.048 +103 4.049 +104 4.048 +105 4.048 +106 4.048 +107 4.049 +108 4.048 +109 4.048 +110 4.048 +111 4.048 +112 4.048 +113 4.051 +114 4.048 +115 4.069 +116 4.050 +117 4.048 +118 4.048 +119 4.048 +120 4.054 +121 4.054 +122 4.048 +123 4.050 +124 4.049 +125 4.048 +126 4.049 +127 4.048 +128 
4.049 +129 4.260 +130 4.446 +131 4.647 +132 4.802 +133 4.978 +134 5.148 +135 5.321 +136 5.490 +137 5.653 +138 5.816 +139 5.980 +140 6.138 +141 7.370 +256 7.068 +"Packed" rjust above at 246, 7.370 +draw solid +1 4.042 +2 4.042 +3 4.042 +4 4.042 +5 4.042 +6 4.043 +7 4.042 +8 4.042 +9 4.042 +10 4.042 +11 4.042 +12 4.042 +13 4.043 +14 4.042 +15 4.041 +16 4.042 +17 4.043 +18 4.042 +19 4.042 +20 4.043 +21 4.043 +22 4.046 +23 4.044 +24 4.043 +25 4.043 +26 4.044 +27 4.042 +28 4.041 +29 4.044 +30 4.043 +31 4.044 +32 4.044 +33 4.044 +34 4.044 +35 4.044 +36 4.045 +37 4.044 +38 4.044 +39 4.044 +40 4.042 +41 4.043 +42 4.042 +43 4.044 +44 4.044 +45 4.044 +46 4.045 +47 4.044 +48 4.051 +49 4.044 +50 4.044 +51 4.043 +52 4.042 +53 4.045 +54 4.044 +55 4.042 +56 4.044 +57 4.049 +58 4.046 +59 4.045 +60 4.045 +61 4.045 +62 4.047 +63 4.045 +64 39.263 +65 39.209 +66 39.163 +67 39.488 +68 39.473 +69 39.752 +70 39.710 +71 39.651 +72 39.605 +73 39.606 +74 39.522 +75 47.264 +76 39.490 +77 40.007 +78 39.945 +79 39.900 +80 39.891 +81 47.525 +82 39.819 +83 40.051 +84 39.993 +85 40.556 +86 40.487 +87 40.470 +88 40.396 +89 40.623 +90 40.565 +91 40.497 +92 41.640 +93 53.333 +94 40.866 +95 40.823 +96 46.649 +97 40.723 +98 40.739 +99 40.896 +100 40.826 +101 41.257 +102 41.462 +103 41.192 +104 41.150 +105 41.309 +106 41.267 +107 41.471 +108 46.722 +109 41.819 +110 41.742 +111 46.823 +112 41.691 +113 41.592 +114 41.554 +115 41.736 +116 41.712 +117 46.795 +118 43.811 +119 41.940 +120 52.439 +121 42.053 +122 42.025 +123 43.049 +124 42.302 +125 42.431 +126 42.403 +127 42.346 +128 42.496 +129 43.304 +130 42.394 +131 42.591 +132 43.344 +133 46.852 +134 43.398 +135 47.048 +136 43.622 +137 46.991 +138 42.750 +139 42.892 +140 43.915 +141 47.368 +142 52.607 +143 46.635 +144 43.154 +145 43.198 +146 43.866 +147 43.205 +148 47.229 +149 44.179 +150 47.845 +151 44.228 +152 45.044 +153 47.489 +154 44.559 +155 52.694 +156 44.713 +157 48.325 +158 43.963 +159 47.580 +160 53.114 +161 48.816 +162 48.765 +163 46.131 +164 
49.539 +165 51.761 +166 48.149 +167 49.600 +168 44.871 +169 49.938 +170 47.790 +171 47.698 +172 48.453 +173 45.148 +174 55.011 +175 45.250 +176 45.917 +177 51.219 +178 48.819 +179 45.335 +180 48.083 +181 58.405 +182 48.727 +183 46.855 +184 46.712 +185 54.348 +186 46.814 +187 48.785 +188 49.653 +189 51.982 +190 51.728 +191 46.027 +192 52.139 +193 53.446 +194 46.605 +195 52.417 +196 52.008 +197 47.167 +198 50.892 +199 54.935 +200 46.870 +201 48.752 +202 46.438 +203 50.100 +204 48.546 +205 49.406 +206 48.250 +207 48.192 +208 49.371 +209 50.398 +210 52.615 +211 49.973 +212 58.927 +213 51.122 +214 47.716 +215 51.216 +216 53.270 +217 49.865 +218 50.324 +219 49.916 +220 49.336 +221 56.814 +222 50.417 +223 50.910 +224 55.038 +225 61.760 +226 53.135 +227 53.262 +228 50.561 +229 48.315 +230 49.193 +231 53.704 +232 53.386 +233 61.107 +234 49.641 +235 49.387 +236 51.842 +237 52.700 +238 49.340 +239 52.748 +240 57.290 +241 49.655 +242 50.643 +243 52.568 +244 52.457 +245 54.264 +246 59.484 +247 52.176 +248 52.697 +249 63.909 +250 56.820 +251 52.252 +252 62.305 +253 51.512 +254 54.730 +255 51.264 +256 52.391 +"Word/Page" rjust at 80, 52.391 diff --git a/performance/lmbench3/doc/memhier.ms b/performance/lmbench3/doc/memhier.ms new file mode 100644 index 0000000..cd81c2b --- /dev/null +++ b/performance/lmbench3/doc/memhier.ms @@ -0,0 +1,1576 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. 
nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2001 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. 
+.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Micro-architecture analysis +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lmdd \f(CWlmdd\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] ", " +.\" .R2 +.EQ +delim $$ +.EN +.TL +\s(14Micro-architecture analysis\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench] version 3 includes a number of new micro-benchmarks +that analyze specific aspects of system micro-architecture, +such as instruction level parallelism, the cache hierarchy and TLB. +.LP +There are new benchmarks to measure instruction level +parallelism, such as the effectiveness of overlapped +memory accesses or arithmetic operations. +There are other new benchmarks to measure various +aspects of the architecture, such as the cache line +size(s), TLB size, and latency costs for basic +arithmetic operations. 
+\*[lmbench] can identify the number of caches, and the size, +line size, and available parallelism for each cache. +It can also measure the effective TLB size. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] version 3 includes a variety of new benchmarks +designed to measure and analyze various aspects of memory +system design and performance. The most important aspect +of memory subsystem performance is typically the memory +hierarchy, the number and size of caches. Other important +aspects include the cache line size, TLB, and memory +parallelism. +.LP +There are any number of aspects of a computer's +micro-architecture that can impact a program's +performance, such as the design of the memory +hierarchy and the basic performance of the various +arithmetic units. +.LP +All of the new benchmarks were added to \*[lmbench] +because the author needed them to help guide his +design decisions in one or more projects over the +last few years. +For example, \*[lat_ops] was added because the +author was trying to decide whether a particular +image processing algorithm should be implemented +using integer or floating point arithmetic. +Floating point arithmetic was preferred for a +variety of reasons, but it was feared that +floating point arithmetic would be prohibitively +expensive compared to integer operations. +By quickly building \*[lat_ops] the author was +able to verify that the floating point performance +should be no worse than integer performance. +.LP +Memory speeds have not kept pace with the dizzying pace +of processor performance improvements. The result has +been a steady increase in the relative cost of memory +accesses, when measured in terms of instructions or +clock ticks. For example, a 2GHz processor with 200ns +memory latency would wait roughly 400 instructions for +a single memory access. +.LP +To alleviate memory bottlenecks, architects use cache +memory to reduce the average memory latency. 
Typically +there are between one and three caches in modern +memory subsystems. A rule of thumb is that each +step down the memory hierarchy results in at least +a doubling of memory latency and at least a doubling +of the cache size. +.LP +The details of the memory hierarchy design can have +a significant impact on application performance +.RN Whaley98 , +but unfortunately developers frequently cannot predict +the exact configuration of machines which will run +their software. Additionally, many developers are +even unaware of the architectural details of their +own machines. +.LP +One hope is that by providing a portable ANSI-C +tool, developers may be better informed about the +architectural possibilities provided by their +own machines, and they may develop more efficient +software which can automatically utilize features +of the particular hardware based on information +provided by these utilities. +.LP +For example, +.RN Staelin02c +proposes variations on familiar data structures +which take advantage of the increased memory +parallelism afforded by modern processors to +increase performance as much as 50%. +.LP +Before explaining the various algorithms and +experimental methods for determining various +aspects of the memory hierarchy design, we +first give a short tutorial on memory system +design. Then we describe the basic techniques +used in analyzing the memory hierarchy, and +how they neutralize or measure various +subsystems or features of the memory system. +Finally, we describe in more detail the +specific algorithms used to measure the various +aspects of the memory subsystem. +.NH 1 +Computer Architecture Primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches.
+.LP +Of necessity, this primer over-simplifies the +many details and variations of specific computer +designs and architectures. For more information, +please see +.RN Hennessy96 . +.TSTART 1 +.so lmbench3_arch.pic +.FEND "Architecture diagram" 1 +.LP +Figure \n[FIGURE] contains a greatly simplified block diagram +of a computer. Various important elements, such as +the I/O bus and devices, have been left out. The +core of the processor are the registers (r0, ..., rn +and f0, ..., fn) and the arithmetic units (ALU and FPU). +In general, the arithmetic units can access data in +registers ''instantly''. Often data must be explicitly +loaded from memory into a register before it can be +manipulated by the arithmetic units. +.LP +The ALU handles integer arithmetic, such as bit +operations (AND, OR, XOR, NOT, and SHIFT) as +well as ADD, MUL, DIV, and MOD. Sometimes there +is specialized hardware to handle one or more +operations, such as a barrel shifter for SHIFT +or a multiplier, and sometimes there is no +hardware support for certain operations, such +as MUL, DIV, and MOD. +.LP +The FPU handles floating point arithmetic. +Sometimes there are separate FPUs for single +and double precision floating point operations. +.NH 2 +Memory hierarchy +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip.
In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced. The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size.\** +.FS +See +.RN Hennessy96 +page 396. +.FE +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. +A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time.
+This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. +.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 2 +Some Recent Innovations +.LP +There are a number of modern extensions to computer +architecture that attempt to increase the processor's +ability to do several things at once. Nearly all of +these enhancements are intended to be invisible to +programmers using higher-level languages such as +C or JAVA. 
+.IP "\fBSuperscalar processors\fR" +Superscalar processors have multiple processing +units which can operate simultaneously. +.IP "\fBDynamic instruction reordering\fR" +Dynamic instruction reordering allows the processor +to execute instructions whose operands are ready +before instructions which are stalled waiting for +memory or other instruction's completion. +.IP "\fBMemory parallelism\fR" +By allowing multiple outstanding memory requests, +processors allow the memory subsystem to service +multiple (independent) requests in parallel. +Since memory accesses are a common performance +bottleneck, this can greatly improve performance. +.IP "\fBVector processing\fR" +Vector processing allows the processor to execute +arithmetic operations on vector operands in +parallel, and in modern commodity processors goes +by names such as MMX, SSE, and 3DNow. +.IP "\fBSimultaneous multi-threading (SMT)\fR" +SMT allows superscalar processors to simultaneously +execute instructions from several threads (contexts) +.RN Tullsen96 . +SMT may include extensions which allow for very +lightweight inter-thread synchronization primitives +that enable much finer-grained thread-level +parallelism than traditional synchronization +methods +.RN Tullsen99 . +.IP "\fBExplicitly parallel instruction computers (EPIC)\fR" +EPIC allows the compiler to explicitly issue $N$ +instructions in parallel at each instruction, which +informs the hardware that these instructions are +independent and may be executed in parallel +.RN Schlansker00 . +It moves much of the burden regarding dependency +checking from the hardware to the compiler. +.NH 1 +Basic operation latency +.LP +\*[lmbench3] includes a new micro-benchmark +which measures the latency for a variety of basic +operations, such as addition, multiplication, and +division of integer, float, and double operands. +To measure the basic operation latency we construct +a basic arithmetic statement containing the desired +operands and operations.
This statement is repeated +one hundred times and these repetitions are then +embedded in a loop. +.TSTART +.TS +center box tab (&); +c c c +l & l & l . +Operand&Operation&Statement +_ +int&$bit$&r^=i;s^=r;r|=s; +&$add$&a+=b;b-=a; +&$mul$&r=(r*i)^r; +&$div$&r=(r/i)^r; +&$mod$&r=(r%i)^r; +_ +float&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +_ +double&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +.TE +.TEND "lat_ops statements" +.LP +Table \n[TABLE] shows the data type and expressions +used for each basic operation type. The variable +$i$ indicates the integer loop variable and generally +changes every ten or hundred evaluations of the +basic expression. All other variables are of +the basic type being measured, and aside from +being modified by the relevant expressions are +only initialized once at the beginning of the +benchmark routine. +.LP +Each statement has been designed to ensure that +the statement instances are \fIinterlocked\fR, +namely that the processor cannot begin processing +the next instance of the statement until it has +completed processing the previous instance. This +property is crucial to the correct measurement of +operation latency. +.LP +One important consideration in the design of +the statements was that they not be optimized +out of the loop by intelligent compilers. +Since the statements are repeated one hundred +times, the compiler has the option of evaluating +the sequence of one hundred repetitions of the +same statement, and sometimes it can find +optimizations that are not immediately +apparent. For example, the integer statement +$a=a+a;$ when repeated one hundred times in +a loop can be replaced with the single statement +$a=0;$ because the statement $a=a+a;$ is equivalent +to $a< < =1;$, and one hundred repetitions of that +statement is equivalent to $a< < =100;$, which for +32bit (or even 64bit) integers is equivalent to +$a=0;$. 
+.LP +It is relatively easy to identify floating +point statements that interlock, are not +optimized away, and that only use the operation +of interest. +It is much harder to identify integer statements +meeting the same criterion. All simple +integer bitwise operations can either be optimized +away, don't interlock, or use operations other +than one of interest. +We chose to add operations other than the +operation(s) of interest to the statements. +.LP +The integer $mul$, $div$, and $mod$ statements all +include an added $xor$ operation which prevents +(current) compilers from optimizing the statements +away. Since the $xor$ operation is generally +completed in a single clock tick, and since +we can measure the $xor$ operation latency +separately and subtract that overhead, we can +still measure the latencies of the other +operations of interest. +.LP +It is not possible to measure latency for 64bit +operations on 32bit machines because most +implementations allow operations on the upper +and lower bits to overlap. This means that +on most 32bit machines, the measured latency +would appear to be a non-integral multiple of +the basic clock cycle. For example, in the +$add$ statement, the system could first add +the two lower words. Then, in parallel it +could both add the two upper words (along with +the carry from the lower words), and compute +the $xor$ of the lower word. Finally, it +can overlap the $xor$ of the upper word +with the addition of the two lower words from +the next instantiation of the statement. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . 
+Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +mhz&&2.50&1.50&0.75 +int&$bit$&2.53&1.50&0.75 +&$add$&2.50&1.51&0.75 +&$mul$&14.52&6.07&3.03 +&$div$&109.40&58.52&30.86 +&$mod$&75.14&65.01&32.59 +_ +float&$add$&7.54&4.58&3.0 +&$mul$&7.50&7.50&3.0 +&$div$&45.00&35.26&13.21 +_ +double&$add$&7.52&4.53&3.01 +&$mul$&7.52&7.71&3.01 +&$div$&85.01&35.51&13.16 +.TE +.TEND "lat_ops results (ns)" +.LP +Table \n[TABLE] contains some sample results +for two processors. +It does contain one result which is slightly +surprising unless you are familiar with the +PA-RISC architecture: floating point multiply +and divide are faster than the corresponding +integer operations! This is because PA-RISC +does not contain integer MUL, DIV, or MOD +instructions and the optimizing compiler +converts the integers into floating point, +does the operations in the floating point +unit, and then converts the result back +to an integer. +.NH 2 +Basic operation parallelism +.LP +Instruction-level parallelism in commodity processors +has become commonplace in the last ten years. +Modern processors typically have more than one +operational unit that can be active during a +given clock cycle, such as an integer arithmetic +unit and a floating point unit. In addition, +processors may have more than a single instance +of a given type of operational unit, both of +which may be active at a given time. All this +intra-processor parallelism is used to try and +reduce the average number of clock cycles per +executed instruction. +.LP +\*[lmbench3] incorporates a new benchmark \*[par_ops] +which attempts to quantify the level of available +instruction-level parallelism provided by the processor. This +benchmark is very similar to \*[lat_ops], and +in fact uses the same statement kernels, but it +has been modified and extended. We create +different versions of each benchmark; each +version has $N$ sets of interleaved statements. +Each set is identical to equivalent \*[lat_ops] +statements. 
In this way multiple independent +sets can be executing the same operation(s) +in parallel, if the hardware supports it. +.LP +For example, the float $mul$ benchmark to measure +performance with two parallel streams of statements +would look something like this: +.DS L +\f(CW#define TEN(a) a a a a a a a a a a +void benchmark_1(iter_t iterations, void* cookie) +{ + register iter_t i = iterations; + struct _state* state = (struct _state*)cookie; + register float f0 = state->float_data[0]; + register float f1 = state->float_data[1]; + + while (i-- > 0) { + TEN(f0*=f0; f1*=f1;) + } + use_int((int)f0); + use_int((int)f1); +}\fP +.DE +.LP +If the processor had two floating point multiply +units, then both $f0$ and $f1$ multiplies could +proceed in parallel. +.LP +However, there are some potential problems with +the integer operations, namely the fact that the +statements contain mixed operations. In general, +processors have at least as many integer units +that can do $xor$ as can do the other operations +of interest ($mul$, $div$ and $mod$), so the +inclusion of $xor$ in the statements shouldn't +be a bottleneck. +.LP +However, since parallelism is measured by comparing +the latency of the single-stream with that of +multiple interleaved streams, and since the single-stream +latency includes the $xor$ latency, the apparent +parallelism of $mul$, $div$, $mod$ can be over-stated.
+For example, if a process has one unit that can +do integer bit operations, such as $xor$, and another +unit for integer $mul$ operations, then the average +latency for $a0 = (i * a0) ^ a0$ in the single stream +case would be: +.EQ +t bar = t sub xor + t sub mul +.EN +In the multi-stream case, the execution of the $xor$ +operation of one stream can be overlapped with the +$mul$ of another stream, so the average latency per +stream would simply be $t bar = t sub mul$, assuming +that $mul$ operations are not cheaper than $xor$ +operations, which results in an apparent parallelism +$p tilde$: +.EQ +p tilde = {t sub xor + t sub mul} over { t sub mul } +.EN +Assuming that $t sub xor < < t sub mul$, this +still gives a reasonable approximation to +the correct answer. Unfortunately, this is +not always a reasonable assumption. +.LP +Of course, if it was known ahead of time that +$xor$ and { $mul$, $div$, and $mod$ } used +different execution units, then the benchmark +could simply subtract $t sub xor$ from the +baseline measurement. The difficulty lies +in determining whether the units overlap +or not. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . +Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +int&$bit$&1.99&1.70&1.87 +&$add$&1.99&1.61&1.90 +&$mul$&6.64&3.81&2.00 +&$div$&2.81&1.20&1.00 +&$mod$&2.78&1.11&1.03 +_ +float&$add$&5.88&1.00&2.66 +&$mul$&5.86&1.14&2.47 +&$div$&2.12&1.03&1.14 +_ +double&$add$&5.68&1.08&2.49 +&$mul$&5.58&1.00&2.53 +&$div$&2.19&1.03&1.14 +.TE +.TEND "par_ops results" +.LP +.NH 1 +Memory analysis +.LP +There are a variety of aspects of memory hierarchy design +that are interesting to a software developer, such as +the number of caches and their sizes. In addition, other +aspects of cache design, such as the line size, +associativity and parallelism can impact software +performance and are of potential interest to software +developers. 
+.LP +The problem is designing a portable ANSI-C program to +infer the cache parameters. A number of operating +systems have hooks to report at least certain aspects +of cache and memory hierarchy design, but any program +utilizing those hooks would not be fully portable +across hardware and operating system platforms. +.LP +The key observation is that caches help reduce memory +latency. In a perfect world, all possible data would +fit in the cache, so a graph of average memory latency +versus amount of memory utilized would look like a +series of plateaus separated by cliffs. The cliff +edges would be located at the cache boundaries and +the plateau height would be the average memory latency. +.LP +The first problem is that one needs a mechanism for +accurately measuring time in a portable fashion. +\*[lmbench2] introduced a new timing harness +that determines the minimum duration of a timing interval +for \*[gettimeofday] to provide accurate measurements +.RN Staelin98 . +.LP +\*[lmbench] includes a benchmark that measures +average memory latency, \*[lat_mem_rd] +.RN McVoy96 . +It creates a pointer chain, and then measures the +average time to dereference the pointers. +\*[lat_mem_rd] creates the pointer chain by simply +striding through memory at fixed intervals, e.g. +every other word. +.LP +\*[lmbench2] extended \*[lat_mem_rd] so +that each timing interval only accessed memory +as many times as necessary to consume a timing +interval. When accessing cache this often means +that the whole pointer chain will be accessed +at least once during the timing interval, but +when accessing memory this often means that only +a portion of the chain will be accessed during +any given timing interval. +.LP +While this approach gives very useful insights +into memory hierarchy performance, it is not +quite sufficient to determine the various +characteristics of the memory hierarchy. 
+.LP +The first problem is that unless the stride is +exactly the same size as the cache line size, then +there will either be multiple successive accesses +to the same line, or some fraction of data +will be completely skipped. In the first case +the observed latency is much faster than the +true latency because it is the average of a +single miss latency (slow) with one or more +hit latencies (fast). In the second case, the +amount of data actually loaded into the cache +may be a small fraction of the expected amount +so the data may fit into a smaller (faster) +cache. +The second problem is that this sequence is +highly predictable, even by simple-minded +prefetching policies, so accurate prefetching +might be masking the true memory latencies. +.LP +This method does do a few things properly. +First of all, accesses to a single page are +clustered together so the TLB miss cost (if +any) is amortized over as many accesses as +possible. Secondly, assuming the pointer +chain is laid out unpredictably, the memory +subsystem must wait for the previous load +to complete before it can initiate the +next load, so we can measure the true latency. +.NH 2 +Prefetching +.LP +Some memory subsystems have been highly optimized to +recognize and automatically prefetch memory when +given "predictable" memory access streams, such as +when striding through array accesses. This means that +the memory access stream generated by \*[lmbench] +must be unpredictable by the standard prediction +algorithms. +.LP +The original \*[lmbench] memory latency benchmark, +lat_mem_rd, built a chain of pointers that would +stride backwards through memory. This was able to +defeat many simple prefetching algorithms of the +time, but some systems came to incorporate prefetching +algorithms that recognized strided accesses in +both directions. +.LP +The obvious method for producing an unpredictable +chain of line references is to use a random +permutation of line indexes. 
+.LP +\*[lmbench] uses a deterministic algorithm to compute +the reference chain which guarantees that references +are as far away from previous accesses in both time +and space as possible. Basically, the binary bits +representing the line index are reversed, so that +1101 becomes 1011, or 001 becomes 100. This only +works if the number of cache lines is an even power +of two, but since page sizes and line sizes are +always powers of two, this assumption is valid.\** +.FS +At least this is the case in every modern system known +to the author. +.FE +.LP +Additionally, since higher-level caches can have +smaller line sizes than lower-level caches, it +is necessary to access every word in the relevant +chunk of memory. However, accesses to words in +the same line must be separated in time by accesses +to the rest of the memory. This is achieved by +identifying the line size for the largest cache, +and then setting up the chain so that there is +one pass through the memory for each word in the +line with the sequence of words being determined +by the bit-reversal method described above. +.LP +For example, suppose a system has 4KB pages, the +largest cache has a line size of 64bytes, and a +word is 4bytes. Then each page would have 64 lines, +and each line would have 16 words. The system +would set up a pointer chain that visits each line +on each page using the zeroth word; at the end of +the chain it would then jump to the start of the +pages and visit each line on each page using the +eighth word, and so forth until each word had been +visited. +.NH 2 +Dirty data +.LP +An additional issue that we need to take into +account is the cache's policy for dirty data. +Many caches use a copy-back policy, while others +use a write-through policy. +.LP +Different caches on the same machine may use +different policies. Also, cache performance +can be affected by the presence of dirty data.
+For example, suppose both the L1 and L2 caches +use a copy-back policy, and suppose that the +access time for reading data located in L2 +depends on whether the data being ejected from +L1 is dirty and needs to be copied back from L1 +to L2 before the read from L2 to L1. +In this case, a benchmark which writes a pointer +chain that fits in L2 but is larger than L1, +and then measures the time to follow the chain, +will get a different average memory latency than +a benchmark which writes the same chain and +reads enough data to flush the L2 cache before +measuring the time to follow the chain. +In the first case, each application read will +result in a write from L1 to L2 followed by +a read from L2 to L1, while in the second +case each application read will only result +in a read from L2 to L1. +.LP +Since it is possible that average memory latencies +for a read-only access stream may be increased if +any of the data in the cache is dirty, we need to +flush the cache after setting up the pointer +chains and before we do any measurements. +Otherwise, when we access a pointer chain that +is larger than the L1 cache but smaller than the +largest cache, dirty data can reside in the lowest +(largest) cache and as each line is staged from +the largest cache to the L1 cache, it is marked +as dirty in the L1 cache. Then when each dirty +line is flushed from the L1 cache (to the L2 +cache), the system has to write the data back to +L2, which delays the load of the next (dirty) +line from L2 to L1. +.LP +To flush the cache we read (and sum) a large +amount of memory, which should be several times +larger than the largest cache. In this way, +all dirty data in the cache should be flushed +from the cache without creating additional +dirty data. +.NH 2 +Page mapping +.LP +Complicating the issue still further is the fact that +caches do not use full LRU replacement policies. 
Nearly +all caches use some form of set associativity, where +pages are directed to a pool of cache lines based on +the physical address. Replacement within the pool is +typically LRU. Direct-mapped caches are a special case +where the pool size is a single line. +.LP +Additionally, some systems use victim caches, which are +typically small caches which cache recently discarded +cache lines. Victim caches can be particularly effective +for direct-mapped caches by reducing the cache miss +rate caused by colliding hot spots. +.LP +However, page mapping and its attendant cache collisions +is under the control of the kernel, and is in fact +invisible to user-land programs. Some operating +systems make an effort to minimize possible page collisions +when giving memory to processes\**, while other operating +systems appear to simply grab the first available pages, +regardless of potential cache collision effects. +.FS +This is generally known as "page coloring", and is much +more important on systems with direct-mapped caches than +those with N-way set associative caches. +.FE +.LP +Factoring out page placement effects on average memory +latency is very difficult, but it is necessary to +ensure that the correct cache size is identified. +.NH 1 +Cache line size +.LP +The first feature of the memory hierarchy we +will try to analyze is the cache line size, +since we can find the line size for the +largest cache without any other knowledge of +the system, and since determining nearly all +other aspects of the memory subsystem either +require or are greatly simplified by knowing +the cache line size. +.LP +The most obvious aspect of cache design is that replacement +is done on a per-line basis, and cache lines often contain +several words of data (32-128bytes per line is common). +However, it is necessary to ensure that we don't +generate "spurious" cache hits by referencing a word from +a cache line that was recently accessed.
We must ensure
+that each line is only re-referenced after all other
+memory in the buffer has been referenced.
+.LP
+Unfortunately, we usually do not know the cache line size
+ahead of time.  In addition, sometimes systems contain
+several caches, and each cache can use a different line
+size!  Usually line sizes are powers of two, and usually
+the smaller (higher) caches have line sizes which are the
+same or smaller than the larger (lower) caches.  However,
+we still need to ensure that we access all cache lines
+for all caches without generating the spurious cache hits.
+.LP
+Determining the cache line size requires a series of
+experiments.  The basic observation is that when the
+amount of memory being accessed is larger than the
+cache, and when the access chain is arranged properly,
+then each memory reference causes a cache miss.  If
+however, a word on a recently accessed line is requested,
+then that reference will be a cache hit.  More
+completely, the average memory access time $t bar$
+is:
+.EQ
+t bar = t sub miss + ( n - 1 ) t sub hit
+.EN
+expressed as a function of $n$, the number of accesses
+to the cache line, $t sub miss$, the cache miss latency,
+and $t sub hit$, the cache hit latency.
+.TSTART
+.G1
+.so memhier-line.d
+.G2
+.FEND "Line Size"
+.LP
+We can determine the cache line size by measuring
+the average memory access latency over a series of
+memory access patterns: accessing every word, every
+other word, every fourth word, every eighth word, ...
+While the system is accessing multiple words per
+cache line, the average memory latency will be
+smaller than the cache miss latency, and as the
+space between accesses increases, the average
+memory latency will grow.
+When the system accesses only one word per line,
+the average memory latency will remain level even
+as the spacing between accesses increases.
+.LP
+It is possible to utilize this behavior to identify
+the cache line size.
The algorithm is to measure +the average memory latency when each word is +accessed. Then as you increase the space between +accessed words (doubling the space each iteration), +you look for a situation where the average latency +increased dramatically, say greater than 30%, +followed by a levelling off on the next iteration, +say an increase less than 15%. The line size is +the last point where the average latency jumped +dramatically. +.NH 1 +TLB +.LP +Measuring the TLB-miss costs assumes that one can isolate +those costs from the rest of the memory access costs. The +key observation is that it is often possible to create a +situation in which all data being accessed resides in the +cache, and yet it requires a TLB-miss to be able to locate +it. +.LP +This program identifies the effective TLB size, rather +than the true TLB size. First of all, from a programmer's +point of view, it is really the effective TLB size that +impacts program performance. Secondly, there is no way +for a user-land program to measure true TLB size because +kernels sometimes pin some kernel page mappings into the +TLB and because some hardware/OS combinations +support "super-pages", or multi-page mappings. +.LP +We create two similar pointer chains with identical length +and which reference an identical amount of memory, with one +key difference. In the first chain, the data is packed +tightly into as few pages as possible, and references +remain within a single page as long as possible. The +second chain spreads the data over as many pages as +possible and jumps between pages at each reference. +The two chains are arranged so that the same amount of +data will fit into the cache, so that the raw memory +access time for each chain is identical, within +experimental constraints. The sole difference between +average access costs should be the TLB-lookup times. +.LP +When the pages from the second chain fit into the TLB, +the average access times for the two chains should be +identical. 
However, as soon as the number of pages in +the second chain exceeds the TLB size, the second +chain will start to pay frequent TLB-miss costs. +Depending on the TLB replacement policy, the fraction of +requests generating TLB-misses in the second chain can vary +dramatically\**. +.FS +Pure LRU would ensure that as soon as the chain was one +page longer than the TLB size, every access would trigger +a TLB-miss. However, other replacement algorithms might +result in as few as $"number of pages" - "TLB size" + 1$ +misses per iteration over the loop. +.FE +.TSTART +.G1 +.so memhier-tlb.d +.G2 +.FEND "TLB" +.LP +The system must search for the point at which the +average memory latency of the second chain diverges +from the average latency of the first chain. Since +most systems have relatively small TLBs and since +checking TLB sizes smaller than the effective TLB +size is faster than checking TLB sizes larger than +the TLB, the system starts with the guess of eight +pages to establish a baseline. It then iteratively +doubles the number of pages until either a maximum +limit has been reached or the average TLB-miss cost +is greater than 15% of the average memory latency. +Once it discovers the upper bound on the possible +TLB size, it uses a binary search between the last +two TLB size guesses to find the point at which +the average latency for the two streams diverge. +.NH 1 +Cache size +.LP +For the purpose of identifying the cache size, the +ideal situation is that as long as the amount of +memory is equal to or less than the cache size, then +all the data is in the cache and the average memory +latency is the cache hit latency. 
As soon as the +memory doesn't fit in cache, then none of it should +be in the cache, so the average memory latency is +the cache miss latency.\** When examining average +memory latency versus memory size, this would give +nice flat plateaus for each cache, with nice sharp +transitions from one cache to the next, and from the +largest cache to main memory. +.FS +Of course, for real programs, you want the average +memory latency to be as low as possible, which means +that you want as much of the data in cache as possible. +.FE +.LP +However, the realities are that real data from real +systems is corrupted in a variety of ways. +First of all, even when the memory can fit into the +cache, pages often collide in the cache and the +fraction of pages that have collisions often +increases as the amount of memory nears the cache size. +Secondly, even when the memory cannot fit into the +cache, there can be pages that do not collide. +Finally, there is simple experimental noise, which is +usually limited to 1% or less. +.LP +The result of the first two problems is that on +some systems, the average memory latency increases +gradually as the memory size is increased. There +are no flat plateaus and sharp cliffs which make +it easy to identify the number, size, and +performance of the caches. +.NH 2 +Page coloring +.LP +The first problem is to create a set of pages +which do not collide in the cache. +The solution is to allocate more memory +than necessary, and to try different combinations +of pages to find the page set with the fastest +average memory latency. Unfortunately, the obvious +algorithm is exponential in the number of pages. +.TSTART +.G1 +.so memhier-color.d +.G2 +.FEND "Page Coloring Effects" +.LP +One observation is that cache misses are usually +much more expensive than cache hits. So, one +possibility is to choose a random set of pages +as the baseline and measure the average memory +latency. 
Then iterate over the pages, removing +that page from the set and measuring the average +memory latency of the reduced set. If that page +collides with another page, then the average +memory latency for the reduced set should be smaller +than the average latency for the whole set. +.LP +Once a page that collides has been identified, then +the system can iterate through available pages, +try adding them to the reduced set and measuring +the average memory latency. If the page doesn't +collide with any pages in the reduced set, then +the average memory latency should drop still further. +In this way, the system could identify all +colliding pages and replace them with pages +that don't collide (assuming the memory all +fits in the cache). +.LP +There are a number of problems with this simple approach. +First of all, it would take a very long time to run due +to the large, but polynomial, number of experiments required. +Secondly, as the memory size increases and the +number of pages involved gets large, the effect +of a single page on the average memory latency +can reach the level of experimental noise. +.LP +This approach makes the assumption that physical +page locations do not change once the memory +has been allocated. In most systems, this +assumption is valid unless the memory is paged +to disk. However, at least IRIX includes an +operating system configuration option to allow +the operating system to dynamically relocate +pages in memory. This capability is disabled +by default, so its use is relatively uncommon. +It is possible that page relocation will become +more common in the future, in which case this +design may need to be revisited in the future. +.LP +Our algorithm uses this basic approach, but +attempts to reduce the number of experiments +required by removing chunks of pages at a time. 
+It will remove up to 5% of pages at a time +and see if the average memory latency decreases +significantly, in which case it examines the +chunk a page at a time to find the page or +pages which probably conflict. +.LP +An additional problem is that for large caches, +the measured difference between two sets of +pages with just one page collision difference +can be very hard to measure. For example, +on a system with a 512Kbyte L2 cache and 4Kbyte +pages, the cache can hold 128 pages. Assuming +that a cache miss is 200ns, a cache hit is 50ns, +and 123 pages have no collisions but 5 pages +collide, then the average memory latency is +.EQ +t bar = { 123 times 50 + 5 times 200 } over 128 +.EN +or 55.85ns. Suppose we remove one page and +replace it with another page which doesn't +collide, so we now have 4 collisions and +124 pages without collisions, then the +average memory latency is 54.68ns. The +difference is generally significant even +in the face of experimental noise, but for +larger caches the differences may recede +into the background noise. +.LP +As caches increase in size, the problems +associated with detecting page collisions +can only increase. +For example, an 8MB cache on a system with +4KB pages would contain 2,048 pages. +Removing a single page collision, even when +the resulting memory latency for that page +reduces by a factor of four, would simply +result in an overall reduction in average +memory latency of less than 0.2%, which is +smaller than the average experimental measurement +errors. +.LP +Additionally, as caches increase in size, +effects such as cache consumption by the +page table can begin to become important. +.LP +The single largest remaining problem in our +system is that this algorithm does not +guarantee that we find a set of pages +which do not contain any collisions in all +cases that it \fImight\fR find such a set. +It merely does so \fImost\fR of the time +with (relatively) few measurements. 
+.LP
+One possible means of dealing with this
+problem is to try and remove sets of pages
+in the hope that enough pages from a set
+of colliding pages will be removed at
+once, so that the remaining pages from
+that collision set won't collide anymore.
+Suppose you have a 4-way set associative
+cache, and that you have six pages that
+collide.  If you remove two of the pages,
+then the remaining four pages don't collide
+anymore either.  This means that by
+removing two pages we have removed six
+collisions, which should be easier to
+detect.
+.LP
+XXX Look into randomizing the pages
+after each iteration of the top-level
+loop to make this sort of serendipitous
+event more likely.
+.NH 2
+Measurement
+.LP
+In order to reduce the number of memory sizes
+that are measured by the system, we use a
+binary search on memory sizes to find "edges"
+in the memory latency.
+We make the simplifying assumption that cache
+sizes are either a power of two, or 1.5 times
+a power of two.  In our experience, this assumption
+has been true.
+We also assume that no cache is smaller than
+512 bytes.
+.LP
+We explore the memory space at intervals
+equivalent to the most recent power of two
+divided by four.  So, starting at one
+megabyte we would (potentially) measure
+memory latency at 1MB, 1.25MB, 1.5MB, and
+1.75MB.  This allows us to detect
+cache sizes at the desired intervals, since
+the measurement at the exact cache size
+can often be corrupted by other system
+activity so the next smaller measurement
+should still be valid.
+.LP
+XXX If the measurement size increment is
+several times larger than a page, then
+perhaps we should actually measure the
+system with a couple pages less than the
+stated size?
+This would allow us some "slop" for
+collisions and might make it easier near
+cache boundaries to get accurate
+measurements.
+The "slop" should probably be some fraction
+of the measurement increment size, such as
+10%, so it scales properly.
+.LP +Since we start with a maximum size as a given, +and we use 512 bytes as a minimum, and we can +compute the full set of possible measurements, +and initialize an array with the desired sizes. +We can then use a modified binary search on +this array to efficiently locate cache edges +while still (potentially) leaving large, flat +plateaus unexplored between the end points. +.LP +Finally, we assume that true memory latency +is monotonically increasing with the amount +of memory that you access. +This means that if the measured latency ever +decreases as you increase the amount of +accessed memory, then the previous measurement +must have been an error and the value is +replaced by the smaller measurement. +.NH 2 +Data analysis +.LP +Assuming the data collected by the system +were noise-free and that the experimental +system had managed to eliminate all artifacts +such as page coloring effects, then the +next problem is to analyze the data to find +the number and size of the caches. +Basically this means examining the data to +find plateaus and cliffs. +Each plateau would represent a cache, and the +cliff represents the edge (size) of the cache. +.LP +Of course, real data is never perfect, and +there are any number of issues which can +affect the experimental results, so the +analysis methodology must be robust to noise. +.LP +XXX describe analysis methodology here +.NH 1 +Cache associativity +.LP +No modern caches are fully associative, meaning that +no caches use LRU replacement, because the performance +overhead for LRU is so severe. Most caches are +either set associative or direct mapped, meaning +that data from a given location can only go to +one of a small number of cache lines, and in the +case of a direct-mapped cache to a single cache line. +.LP +To determine the cache associativity we need to find +a set of pages which have no page collisions and +which (just) fit into the cache. 
We then need to +locate a page which collides with these pages and +append it to the set. +Then we can iterate through the pages in the initial +page set, removing a page at a time, and comparing +the resulting average memory latency with that of +the full set. +When the average memory latency drops significantly, +then we know that this page conflicts with the +full page set, and since the page set only has one +conflict, we know it conflicts with the newly +introduced page. +The number of pages that conflict with this newly +introduced page is the set associativity. +.LP +There is a potential bug in this algorithm +for systems with victim caches! +If the victim cache can hold at least a page +of data, then this algorithm cannot properly +determine the cache associativity because the +victim cache will play the role of additional +associative cache lines. +.LP +For smaller caches there is the additional +problem that the cache associativity may not +be smaller than the number of pages that the +cache may hold. +In which case, this simple approach will +never find pages that collide in the cache. +The solution to this problem is to increase +the line size and the number of pages so that +only portions of each page are accessed, and +there can be enough pages to create collisions. +.NH 1 +Memory parallelism +.LP +With the increasing memory bottleneck, most modern +systems allow multiple outstanding memory references. +On many systems, the effective parallelism depends +on which part of the memory hierarchy is being +accessed. For example, L1 caches can often service +as many as six or eight outstanding requests, while main +memory systems can usually support at most two +outstanding requests. +.LP +To measure the available parallelism for a given +chunk of memory, the system sets up a pointer +chain running through the memory exactly the same +as if it were to measure the average memory +latency. 
It then uses fifteen different access
+routines, one for each possible level of parallelism.\**
+.FS
+The assumption here is that no memory subsystem
+supports more than sixteen accesses in parallel.
+.FE
+Each routine dereferences $N$ pointers in parallel.
+For example, the inner loop of the routine where
+$N=2$ would look something like this:
+.DS L
+\f(CWwhile (iterations-- > 0) {
+	p0 = (char**)*p0;
+	p1 = (char**)*p1;
+}\fP
+.DE
+.LP
+The available parallelism is the maximum speedup
+over all N compared to the sequential case.
+.LP
+Note that this value is often not integral because
+many factors that go into the effective parallelism,
+such as TLB contention, can limit the effective
+parallelism.
+.NH 1
+DRAM pages
+.LP
+Within DRAM chips there is usually one or more
+lines of data which is "cached" in registers
+near the chip outputs.
+Accessing data contained in these lines is typically
+faster than accessing data from the body of the DRAM
+chip.
+The set of memory contained in a bank of DRAM chips
+for a single line (per DRAM chip) of memory is usually
+called a DRAM page.
+.LP
+Recently some systems have started taking advantage
+of this potential performance increase by keeping
+DRAM pages "open" (in the register bank) after an
+access in the hope that the next access will be
+to the same page.
+This means that main memory latency suddenly
+depends on the access history, and that dramatic
+differences in "open" versus "closed" DRAM page
+performance may impact software and data structure
+design.
+.LP
+To measure DRAM page latency, we need to compare
+performance for accesses to "open" versus "closed"
+DRAM pages.
+The standard pointer chain developed for measuring
+cache and memory latency maximizes "open" DRAM page
+accesses while minimizing other overheads, such as
+TLB misses.
+This means that we need to develop another pointer
+chain which maximizes "closed" DRAM accesses while
+still minimizing other overheads such as TLB misses.
+.LP +This can be done by clustering pages into \fIgroups\fP +whose size is smaller than the TLB size. +Within each group the pointer chain switches pages +on each access to maximize the probability of a "closed" +DRAM page access. +For all but the last page in the group, each access +points to the same location within the page, except +on the next page. +The last page points to the next location in the +first page, using the same location bit-switching +selection logic used in the standard pointer chain. +.NH 1 +Conclusion +.LP +XXX Update conclusions +\*[lmbench] is a useful, portable micro-benchmark suite designed to +measure important aspects of system performance. We have found that a good +memory subsystem is at least as important as the processor speed. +As processors get faster and faster, more and more of the system design +effort will need to move to the cache and memory subsystems. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers +were especially helpful. +We thank all of them and especially thank: +Wayne Scott \s-1(BitMover)\s0, +Larry McVoy \s-1(BitMover)\s0, +Bruce Chapman \s-1(SUN)\s0, +and +John McCalpin \s-1(Univ. of Virginia)\s0. +.LP +We would also like to thank all of the people that have +run the benchmark and contributed their results; none of +this would have been possible without their assistance. +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks are available at: +.QP +\fIhttp://ftp.bitmover.com/lmbench\fP +.ft +.\" .R1 +.\" bibliography references-memhier +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. 
\" Divert the label so as to freeze any spaces.
+.	di par*label
+.	in 0
+.	nf
+\&\\$1
+.	di
+.	in
+.	fi
+.	chop par*label
+.	ti -\\n[\\n[.ev]:ai]u
+.	ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c
+.	el \{\
+\\*[par*label]
+.\".	br
+.	\}
+.	rm par*label
+.\}
+..
+.\"********************************************************************
+.\" redefine the way the reference tag is printed so it is enclosed in
+.\" square brackets
+.\"
+.de ref*end-print
+.ie d [F .IP "[\\*([F]" 2
+.el .XP
+\\*[ref*string]
+..
+.\"********************************************************************
+.\" Get journal number entries right.  Now will print as V(N) rather
+.\" than the awful V, N.
+.\"
+.de ref*add-N
+.ref*field N "" ( )
+..
+.\"********************************************************************
+.\" Get journal volume entries right.  Now will print as V(N) rather
+.\" than the awful V, N.
+.\"
+.de ref*add-V
+.ref*field V , "" "" ""
+..
+.\"********************************************************************
+.\" Get the date entry right.  Should not be enclosed in parentheses.
+.\"
+.de ref*add-D
+.ref*field D ","
+..
+.R1
+accumulate
+sort A+DT
+database references-memhier
+label-in-text
+label A.nD.y-2
+bracket-label [ ] ", "
+bibliography references-memhier
+.R2
+.\" .so bios diff --git a/performance/lmbench3/doc/mhz.8 b/performance/lmbench3/doc/mhz.8 new file mode 100644 index 0000000..b9cd1b7 --- /dev/null +++ b/performance/lmbench3/doc/mhz.8 @@ -0,0 +1,29 @@ +.\" $Id: mhz.8 1.3 00/10/16 17:13:52+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $
+.TH MHZ 8 "$Date: 00/10/16 17:13:52+02:00 $" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH"
+.SH NAME
+mhz \- calculate processor clock rate
+.SH SYNOPSIS
+.B mhz
+.I [-c]
+.SH DESCRIPTION
+.B mhz
+calculates the processor clock rate and megahertz.  It uses an
+unrolled, interlocked loop of adds or shifts.  So far, superscalarness
+has been defeated on the tested processors (SuperSPARC, RIOS, Alpha).
+.SH OUTPUT +Output format is either just the clock rate as a float (-c) or more verbose +.sp +.ft CB +39.80 Mhz, 25 nanosec clock +.ft +.LP +.B mhz +is described more completely in ``mhz: Anatomy of a microbenchmark'' +in +.I "Proceedings of 1998 USENIX Annual Technical Conference", June 1998. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/par_mem.8 b/performance/lmbench3/doc/par_mem.8 new file mode 100644 index 0000000..0844f55 --- /dev/null +++ b/performance/lmbench3/doc/par_mem.8 @@ -0,0 +1,68 @@ +.\" $Id$ +.TH PAR_MEM 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +par_mem \- memory parallelism benchmark +.SH SYNOPSIS +.B par_mem +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B par_mem +measures the available parallelism in the memory hierarchy, up to +.I len +bytes. Modern processors can often service multiple memory requests +in parallel, while older processors typically blocked on LOAD +instructions and had no available parallelism (other than that +provided by cache prefetching). +.B par_mem +measures the available parallelism at a variety of points, since the +available parallelism is often a function of the data location in the +memory hierarchy. +.LP +In order to measure the available parallelism +.B par_mem +conducts a variety of experiments at each memory size; one for each +level of parallelism. It builds a pointer chain of the desired +length. It then creates an array of pointers which point to chain +entries which are evenly spaced across the chain. Then it starts +running the pointers forward through the chain in parallel. 
It can
+then measure the average memory latency for each level of parallelism,
+and the available parallelism is the minimum average memory latency
+for parallelism 1 divided by the average memory latency across all
+levels of available parallelism.
+.LP
+For example, the inner loop which measures parallelism 2 would look
+something like:
+.sp
+.ft CB
+        p0 = (char **)*p0;
+        p1 = (char **)*p1;
+.ft
+.sp
+in a
+.I for
+loop (the overhead of the
+.I for
+loop is not significant; the loop is an unrolled loop 100 loads long).
+.SH OUTPUT
+Output format is intended as input to \fBxgraph\fP or some similar program
+(we use a perl script that produces pic input).
+There is a set of data produced for each stride.  The data set title
+is the stride size and the data points are the array size in megabytes
+(floating point value) and the load latency over all points in that array.
+.SH "SEE ALSO"
+lmbench(8), line(8), cache(8), tlb(8), par_ops(8).
+.SH "AUTHOR"
+Carl Staelin and Larry McVoy
+.PP
+Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/par_ops.8 b/performance/lmbench3/doc/par_ops.8 new file mode 100644 index 0000000..8327162 --- /dev/null +++ b/performance/lmbench3/doc/par_ops.8 @@ -0,0 +1,39 @@ +.\" $Id$
+.TH PAR_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH"
+.SH NAME
+par_ops \- basic CPU operation parallelism
+.SH SYNOPSIS
+.B par_ops
+[
+.I "-W <warmups>"
+]
+[
+.I "-N <repetitions>"
+]
+.SH DESCRIPTION
+.B par_ops
+measures the available parallelism for basic CPU operations, such as
+integer ADD.  Results are reported as the average operation latency
+divided by the minimum average operation latency across all levels of
+parallelism.
+.TP
+integer bit, add, mul, div, mod operations
+maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations.
+.TP
+uint64 bit, add, mul, div, mod operations
+maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations.
+.TP
+float add, mul, div operations
+maximum parallelism for float ADD, MUL, DIV operations.
+.TP
+double add, mul, div operations
+maximum parallelism for double ADD, MUL, DIV operations.
+.SH BUGS
+This benchmark is highly experimental and may sometimes (frequently?)
+give erroneous results.
+.SH "SEE ALSO"
+lmbench(8), lat_ops(8), par_mem(8).
+.SH "AUTHOR"
+Carl Staelin and Larry McVoy
+.PP
+Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/parallel.ms b/performance/lmbench3/doc/parallel.ms new file mode 100755 index 0000000..b906446 --- /dev/null +++ b/performance/lmbench3/doc/parallel.ms @@ -0,0 +1,385 @@ +.\" This document is GNU groff -mgs -t -p -R -s
+.\" It will not print with normal troffs, it uses groff features, in particular,
+.\" long names for registers & strings.
+.\" Deal with it and use groff - it makes things portable.
+.\"
+.\" $X$ xroff -mgs -t -p -R -s $file
+.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more
+.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr
+.VARPS
+.\" Define a page top that looks cool
+.\" HELLO CARL!  To turn this off, s/PT/oldPT/
+.de draftPT
+.\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP'
+..
+.de lmPT
+.if \\n%>1 \{\
+.	sp -.1i
+.	ps 14
+.	ft 3
+.	nr big 24
+.	nr space \\w'XXX'
+.	nr titlewid \\w'\\*[title]'
+.	nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2
+.	ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25'
+.	ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0
+.	ce 1
+\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar]
+.	ps
+.	sp -.70
+.	ps 12
+\\l'\\n[LL]u'
+.	ft
+.	ps
+.\}
+..
+.\" Define a page bottom that looks cool
+.\" HELLO CARL!  To turn this off, s/BT/oldBT/
+.de draftBT
+.\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP'
+..
+.de lmBT
+.	ps 9
+\v'-1'\\l'\\n(LLu'
+.	sp -1
+.	tl '\(co 2001 \\*[author]'\\*(DY'%'
+.	ps
+..
+.de SP
+.	if t .sp .5
+.	if n .sp 1
+..
+.de BU
+.	SP
+.	ne 2
+\(bu\ 
+.	
if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Utilizing instruction-level parallelism +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds lmdd \f(CWlmdd\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.ds select \f(CWselect\fP +.ds lat_ops \f(CWlat_ops\fP +.ds benchmp \f(CWbenchmp\fP +.ds lat_connect \f(CWlat_connect\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] 
", "
+.\" .R2
+.EQ
+delim $$
+.EN
+.TL
+\s(14Utilizing instruction-level parallelism\s0
+.AU
+\s+2\fR\*[author]\fP\s0
+.AI
+\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP
+.SP
+.AB
+Modern processors and systems provide a great deal of
+parallelism, even for traditional single-threaded
+software.
+Often this parallelism is hidden, but the potential
+performance benefits of restructuring software to allow
+the hardware to utilize this parallelism can be striking.
+For example, modern memory systems can usually support
+at least two outstanding requests to main memory, and
+as many as six or eight outstanding requests to cache
+memory.  Since memory latencies can account for a
+significant fraction of many programs' runtime,
+restructuring data structures and algorithms so
+strictly sequential memory accesses can be
+parallelized can greatly improve performance.
+.AE
+.if t .MC 3.05i
+.NH 1
+Introduction
+.LP
+Computer scientists are generally taught some basic computer
+architecture and a set of standard data structures and
+algorithms, such as lists, hash tables, and binary search.
+These data structures and algorithms are commonly used and
+in many programs their handling can consume a significant
+fraction of the overall runtime.
+However, these data structures and algorithms were
+designed over thirty years ago, when most processors had
+no parallelism.
+.LP
+There has been a great deal of work by compiler writers
+and computer architects on automatically discovering and
+utilizing instruction-level parallelism in existing
+software, but relatively little work has been done on
+examining data structures and algorithms that can enable
+increased instruction-level parallelism.
+.LP
+There has been a great deal of work focussing on
+developing parallel algorithms for multi-processor
+machines, with explicit synchronization primitives
+such as semaphores and barriers.
At this level of +parallelism, the overheads are generally so high +that the parallelism must be fairly coarse-grained, +or else the overhead costs consume any benefits +provided by the parallelism. +.LP +However, instruction-level parallelism is "free"; it +is managed by the hardware and incurs no additional +runtime costs. +The main question is how to structure software algorithms +and data structures to maximize the available parallelism. +.NH 1 +Prior work +.LP +Over the last few years, there has been some work on +improving the performance of critical software in a +architecture-sensitive manner. +.LP +.RN Agarwal96 +describes the design and implementation of a +fast sorting algorithm for superscalar RISC machines. +.LP +The Automatically Tuned Linear Algebra System (ATLAS) +.RN Whaley98 +contains a number of parametrized code generators +for matrix multiply operations, as well as a pluggable +architecture to allow developers to add hardware-specific +modules. +ATLAS then explores the parameter space to find the +optimal parameter settings for the particular system. +.LP +FFTW +.RN Frigo98 +is another project which uses architecture-aware +optimizations. +.NH 1 +Computer architecture primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches. +.NH 2 +Traditional architecture +.LP +One view of a traditional architecture might be the +MIX system defined by Knuth in his classic work on +algorithms and data structures +.RN Knuth73 . +While the MIX instruction set and architecture does +not forbid parallelism, there is no explicit parallelism +mentioned in the description. +Consequently, none of the algorithms assumes any +instruction-level parallelism, or is structured to +explicitly utilize such parallelism had it existed. 
+.LP +The MIX system has a single arithmetic logic unit, +and no floating point unit, so there is no explicit +instruction-level parallelism specified in the +architecture. +.NH 2 +Modern Extensions +.LP +There are a number of modern extensions to computer +architecture that attempt to increase the processor's +ability to do several things at once. Nearly all of +these enhancements, with the notable exception of +the EPIC work, are intended to be invisible to the +average programmer. Most notably, they do not require +changing the instruction set. +.IP "Superscalar processors" +Superscalar processors have multiple processing +units which can operate simultaneously. +.IP "Dynamic instruction reordering" +Dynamic instruction reordering allows the processor +to execute instructions whose operands are ready +before instructions which are stalled waiting for +memory or other instruction's completion. +.IP "Memory parallelism" +By allowing multiple outstanding memory requests, +processors allow the memory subsystem to service +multiple (independent) requests in parallel. +Since memory accesses are a common performance +bottleneck, this can greatly improve performance. +.IP "Vector processing" +Vector processing allows the processor to execute +arithmetic operations on vector operands in +parallel, and in modern commodity processors goes +by names such as MMX, SSE, and 3DNow. +.IP "Simultaneous multi-threading (SMT)" +SMT allows superscalar processors to simultaneously +execute instructions from several threads (contexts) +.RN Tullset96 . +SMT may include extensions which allow for very +lightweight inter-thread synchronization primitives +that enable much finer-grained thread-level +parallelism than traditional synchronization +methods +.RN Tullsen99 . 
+.IP "Explicitly parallel instruction computers (EPIC)" +EPIC allows the compiler to explicitly issue $N$ +instructions in parallel at each instruction, which +informs the hardware that these instructions are +independent and may be executed in parallel +.RN Schlansker00 . +It moves much of the burden regarding dependency +checking from the hardware to the compiler. +.NH 1 +Conclusion +.LP +With the increasing proliferation of both explicit and +hidden parallelism in processor and memory system +designs, it is becoming important to revisit many data +structures and algorithms to adapt them to the new +hardware environment. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. We thank all of them +and especially thank Larry McVoy \s-1(BitMover)\s0 for the +lively conversations and discussions regarding benchmarking +and experimental design. +.\" .R1 +.\" bibliography references-parallel +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. 
Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.R1 +accumulate +sort A+DT +database references-parallel +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references-parallel +.R2 +.\" .so bios diff --git a/performance/lmbench3/doc/pgraph.1 b/performance/lmbench3/doc/pgraph.1 new file mode 100644 index 0000000..562a58a --- /dev/null +++ b/performance/lmbench3/doc/pgraph.1 @@ -0,0 +1,155 @@ +.\" $Id: pgraph.1 1.3 95/11/29 11:54:39-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH PGRAPH 1 "Nov, 1995" "lm@xxxxxxx" "Docomentation tools" +.SH NAME +pgraph \- compile graphs into pic input +.SH SYNOPSIS +.B pgraph +[ options ] +[ +.I filename +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B pgraph +is a perl script which +takes sets of X Y data and generates a (human readable) pic program +that will produce the graphed data. The output is designed such that +you can save it in a file and tweak it to make it fit your document. +Try one and look at the output. The output is actually commented. +.LP +The graph is autosized and auto ticked. 
+.LP +The input data format is similar to +that of xgraph(1), i.e., +.DS +"sloped across +1 1 +2 2 +3 3 + +"straight across +1 4 +2 4 +3 4 +.DE +.SH "CONTROL OPTIONS" +.LP +You may set the graph title, the X title, and the Y title with the +following control sequences in the data stream: +.DS +%T Graph title in +4 point font +%X X axis title and/or units in +2 point font +%Y Y axis title and/or units in +2 point font +%fakemax-X <value> force graph to be that big +%fakemax-Y <value> force graph to be that big +%fakemin-X <value> force graph to be that small +%fakemin-Y <value> force graph to be that small +.DE +.SH OPTIONS +.IP -rev 12 +reverse X/Y data sense (and titles). Note this is done after processing +any fudging of the input data stream(s) (see -xk, -yk, -logx, etc below). +.IP -below +put data set titles below the graph rather than to the right. +.IP -close +no extra space around the data's endpoints. +.IP -qline +connect the quartile center points. +.IP -grid +dotted line grid marks. +.IP -nobox +no box around whole graph. +.IP -big +make the graph take the whole page, and be about 8 inches tall by 7 inches +wide and the title is +8 points. +.IP -slide +make the graph be 4.25 inches square to fit in slides, +in a helvetica bold 10 point font. +.IP -small +make the graph be small, 1.75 inches square, and use an 8 point bold font. +.IP -grapheach +draw each data set in its own graph. +.IP -nolabels +no X/Y/Title labels. +.IP -notitle +no Title label. +.IP -nodatal +no data set labels. +.IP -nomarks +do not mark each data point with distinct markers (endpoints are still +marked). +.IP -k +print values larger than 1000 as value/1000. +.IP -xk +multiply X input by 1024 (blech). +.IP -yk +multiply Y input by 1024 (blech). +.IP -xm +multiply X input by 1024*1024 (blech). +.IP -ym +multiply Y input by 1024*1024 (blech). +.IP -logx +convert X input into log base 2 of X input. +.IP -logy +convert Y input into log base 2 of Y input. 
+.SH EXAMPLE +Workstation price performance from a Digital ad. Process with +.DS +.ps -2 +graph -rev workstations | groff -TX75 + +%T Workstation Price / Performance, 6/93 +%X SPECINT 92 Performance +%Y Price in $1000's +"Dec AXP line +35 5 +65 10 +78 15 +110 70 + +"Sun SPARC line +25 4 +25 8 +38 16 +48 21 +52 23 +64 27 +.DE +.ps +.SH "QUARTILE FORMAT" +Data points are \f(CBx y1 y2 y3 y4 y5\fP. You get two lines from the +first two y values, a mark at the third, and another line from the last two. +.SH "SEE ALSO" +.BR gtroff (1), +.BR gpic (1), +.BR perl (1). +.SH BUGS +-grapheach assumes the set of N graphs will fit on one page. +.LP +Since it is just a simple perl script, I tend to be constantly adding +one more feature on the fly. Consult the script for the latest set of +options. Development is typically done by using the closest set of options +to generate the graph, massage the graph to do what you want, then add that +set of changes as a new option. +.LP +This isn't done as much as I would like. +It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE things. I use it to manually generate +a pic file and then include that. +.LP +I need to include some example data sets with pgraph. diff --git a/performance/lmbench3/doc/rccs.1 b/performance/lmbench3/doc/rccs.1 new file mode 100644 index 0000000..7bbdf52 --- /dev/null +++ b/performance/lmbench3/doc/rccs.1 @@ -0,0 +1,149 @@ +.\" $Id: rccs.1 1.1 95/11/29 12:52:04-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH RCCS 1 "Nov, 1995" "lm@xxxxxxx" "Programmers tools" +.SH NAME +rccs \- apply RCS commands to sets of files +.SH SYNOPSIS +.B rccs +command +[ options ] +[ +.I filename +and/or +.I directory +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B rccs +is a perl script that tries to emulate the Berkeley \fBSCCS\fP program +for \fBRCS\fP. 
If your fingers know how to type commands to \fBSCCS\fP, +just do the same thing to \fBrccs\fP. +.LP +A subset of the \fBSCCS\fP commands are implemented, the ones that I use. +Some new commands have been added. It is easy to add more commands, see +the \fIExample\fP routine at the bottom of \fBrccs\fP to see how. +.LP +This interface does not require a list of files/directories for most +commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +such as clean -f, unedit, unget, do \fBnot\fP have an implied list. In +other words, \f(CBrccs diffs\fP is the same as \f(CBrccs diffs RCS\fP +but \f(CBrccs unedit\fP is not the same as \f(CBrccs unedit RCS\fP. +.SH COMMANDS +.IP options 8 +Note that RCS options are typically passed through to RCS. The options +that made sense to SCCS commands are translated to RCS options. +.IP "ci" 10 +Alias for delta. Checks in files. +.IP "clean [-e] [-f] [-d|y'message'] [files]" +Without any arguments, this command removes all files that are read only +and have an associated RCS file. +With the -e argument, clean removes files that have been checked out +writable but have not been modified. +The -d|y|m option may be combined with -e to check in the set of files that +have been modified. +With the -f option, clean removes all working files, \fBincluding\fP files +that have been modified since the check out. Be careful. +.IP co +Alias for get. Checks out files. +.IP "create [-y|d'message'] [-g] files" +Initial check in of files to the RCS system. The files are then checked out +readonly unless the -g option is present. +The -y or -d options may be used to set the descriptive text message. +Differs from SCCS in that the +original files are not preserved. +.IP deledit +Alias for delta followed by a get -e. +.IP delget +Alias for delta followed by a get. +.IP "delta [-y|d'message'] [-q] [files]" +Check in a delta of the file. -q is changed to RCS' -s and means to be +quiet about what is happening. 
-y'message' or -d'message' or -m'message' +all get sent through to RCS as the check in message. No other arguments +are translated. +.IP "diffs [-C|c] [-r<rev>] [-sdiff] [files]" +Shows changes between the working files and the RCS file. Note that the +files do not need to be checked out, only writable. -C or -c means do a +context diff. -sdiff means do a side by side diff. The sdiff option will +figure out your screen width if it knows how - see the source to make this +work on your system. +.IP edit +Alias for get -e. +.IP enter +Alias for create -g. +.IP fix +Useful if you just checked in the file and then realized you forgot +something. The fix command will remove the top delta from the history +and leave you with an editable working file with the top delta as the +contents. +.IP "get [-e] [-p] [-k] [-s] [files]" +Get, or check out, the file. Without any options, get just gets the +latest revision of the RCS file in the working file. +With -e, check out the file writable. With -p, send the file to stdout. +With -k, suppress expansion of key words. With -s, be quiet about what +is happening. +.IP help +Get a brief help screen of information. +.IP "history [files]" +Print the RCS history (my format) of the specified files. +.IP "info [files]" +Print the list of files being edited. +.IP print +Alias for a loop that prints the history of each file followed by the +contents of the file. +.IP prs +Alias for history. +.IP prt +Alias for history. +.IP unedit +Alias for clean -f. +.IP unget +Alias for clean -f. +.SH GLOBAL OPTIONS +.IP -debug 10 +Turn on debugging. Used when debugging \fBrccs\fP itself. +.IP -verbose +Be more verbose about what is happening. 
+.SH EXAMPLES +To start off, add a bunch of files to RCS: +.DS +rccs create -y'my program name' myprog.c myprog.h +.DE +Now let's edit them all: +.DS +rccs get -e +.DE +If we didn't change anything, the following gives us a clean directory: +.DS +rccs clean -e +.DE +If we changed myprog.h, the following gives us a clean directory after +checking in myprog.h: +.DS +rccs clean -e -d'some message' +.DE +If we want to see what we changed: +.DS +rccs diffs +.DE +.SH "SEE ALSO" +.BR "RCS commands" , +.BR "SCCS commands" , +.BR sdiff (1), +.BR perl (1). +.SH TODO +It would be nice to implement a \fB-i\fP option that prompted before each +action, especially the destructive ones. diff --git a/performance/lmbench3/doc/refdbms.keys b/performance/lmbench3/doc/refdbms.keys new file mode 100644 index 0000000..ff4ab8d --- /dev/null +++ b/performance/lmbench3/doc/refdbms.keys @@ -0,0 +1,20 @@ +Chen93d +Chen94a +Fenwick95 +Howard88 +Jain91 +McCalpin95 +Ousterhout90 +Park90 +Smith82b +Smith85 +Wolman89 +Wong88 +Agarwal95 +Bailey93 +Bitton83 +Chen91b +Dietrich92 +Leutenegger93 +Nelson89 +TPPC92 diff --git a/performance/lmbench3/doc/references b/performance/lmbench3/doc/references new file mode 100644 index 0000000..03167aa --- /dev/null +++ b/performance/lmbench3/doc/references @@ -0,0 +1,186 @@ +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. 
+%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. 
+%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. 
+%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. 
McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] diff --git a/performance/lmbench3/doc/references- b/performance/lmbench3/doc/references- new file mode 100644 index 0000000..6f18ced --- /dev/null +++ b/performance/lmbench3/doc/references- @@ -0,0 +1,175 @@ +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. 
By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. 
+%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. 
Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] diff --git a/performance/lmbench3/doc/references-lmbench3 b/performance/lmbench3/doc/references-lmbench3 new file mode 100644 index 0000000..3f70416 --- /dev/null +++ b/performance/lmbench3/doc/references-lmbench3 @@ -0,0 +1,430 @@ +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%B Proceedings USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%B Proceedings USENIX Winter Conference +%C San Diego, CA +%D January 1996 +%P 279-284 + +%K Bray90 +%A Tim Bray +%T Bonnie benchmark +%D 1990 +%o 
http://www.textuality.com/bonnie/ + +%z Article +%K Brown97 +%A Aaron Brown +%A Margo Seltzer +%T Operating system benchmarking in the wake of lmbench: a case study of the performance of NetBSD on the Intel x86 architecture +%B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems +%C Seattle, WA +%D June 1997 +%P 214-224 +%o http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html + +%z Article +%A Cristina Hristea +%A Danial Lenoski +%A John Keen +%T Measuring memory hierarchy performance of cache-coherent multiprocessors using microbenchmarks +%B Proceedings of Supercomputing '97 +%D November 1997 +%C San Jose, CA +%o http://www.supercomp.org/sc97/proceedings/TECH/HRISTEA/ + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%I Department of Computer Science, University of Utah +%D May 2001 + +%z Thesis +%K Saavedra92 +%A Rafael H. Saavedra-Barrera +%T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking +%I Department of Computer Science, University of California at Berkeley +%D 1992 + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. 
Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Book +%K Jain91 +%A Raj Jain +%T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling +%I Wiley-Interscience +%C New York, NY +%D April 1991 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. 
We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%B Proceedings USENIX Summer Conference +%C Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%B Proceedings USENIX Winter Conference +%C Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. 
Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. 
The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%Q Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z Article +%K McCalpin02 +%A John D. McCalpin +%T The STREAM2 home page +%o http://www.cs.virginia.edu/stream/stream2/ +%D 2002 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Article +%K Shein89 +%A Barry Shein +%A Mike Callahan +%A Paul Woodbury +%T NFSSTONE: A network file server performance benchmark +%B Proceedings USENIX Summer Conference +%C Baltimore, MD +%D June 1989 +%P 269-275 + +%z Article +%K Weicker84 +%A R.P. Weicker +%T Dhrystone: A synthetic systems programming benchmark +%J Communications of the ACM +%V 27 +%N 10 +%P 1013--1030 +%D 1984 + +%z Article +%K Howard88 +%A J. Howard +%A M. Kazar +%A S. Menees +%A S. Nichols +%A M. Satyanrayanan +%A R. Sidebotham +%A M. West +%T Scale and performance in a distributed system +%J ACM Transactions on Computer Systems +%V 6 +%N 1 +%D February 1988 +%P 51--81 +%k Andrew benchmark + +%z Article +%K Banga97 +%A Guarav Banga +%A Peter Druschel +%T Measuring the capacity of a web server +%B Proceedings USENIX Symposium on Internet Technologies and Systems +%C Monterey, CA +%D December 1997 +%P 61--71 + +%z Article +%K Banga98 +%A Guarav Banga +%A Jeffrey C. 
Mogul +%T Scalable kernel performance for internet servers under realistic loads +%B Proceedings of the 1998 USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 69--83 + +%z Article +%K Mogul99 +%A Jeffrey C. Mogul +%T Brittle metrics in operating systems research +%B Proceedings 7th IEEE Workshop on Hot Topics in Operating Systems (HotOS-VII) +%C Rio Rico, AZ +%P 90--95 +%D March 1999 + +%z Article +%K Regehr2002 +%A John Regehr +%T Inferring scheduling behavior with Hourglass +%B Proceedings of the USENIX Annual Technical Conference FREENIX track +%C Monterey, CA +%D June 2002 +%P 143--156 + +%z Article +%K Seltzer99 +%A Margo Seltzer +%A David Krinsky +%A Keith Smith +%A Xiolan Zhang +%T The case for application-specific benchmarking +%B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems +%C Rico, AZ +%D 1999 +%P 102--107 + +%z Article +%K Smith97 +%A Keith A. Smith +%A Margo L. Seltzer +%T File system aging --- Increasing the relevance of file system benchmarks +%B Proceedings of the 1997 SIGMETRICS Conference +%D June 1997 +%C Seattle, WA +%P 203-213 + +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%B Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Report +%K Whaley97 +%A R. 
Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%I Department of Computer Science, University of Tennessee +%C Knoxville, TN +%R UT-CS-97-366 +%D 1997 +%o http://math-atlas.sourceforge.net/ + +%z Article +%K SPEChpc96 +%Q Standard Performance Evaluation Corporation +%T SPEC HPC96 benchmark +%D 1996 +%O http://www.specbench.org/hpg/hpc96/ + +%z Article +%K Parkbench +%Q PARallel Kernels and BENCHmarks committee +%T PARKBENCH +%D 2002 +%O http://www.netlib.org/parkbench/ + +%z Article +%K NAS +%Q NASA Advanced Supercomputing Division, NASA Ames Research Center +%T NAS parallel benchmarks +%O http://www.nas.nasa.gov/NAS/NPB + +%z Article +%K Glendinning94 +%A Ian Glendinning +%T GENESIS distributed memory benchmark suite +%O http://wotug.ukc.ac.uk/parallel/performance/benchmarks/genesis +%D 1994 + +%z Article +%K Intel99 +%Q Intel +%T Profusion --- An 8-way symmetric multiprocessing chipset +%O http://netserver.hp.com/docs/download.asp?file=tp_profusion(r).pdf +%D July 1999 diff --git a/performance/lmbench3/doc/references-memhier b/performance/lmbench3/doc/references-memhier new file mode 100755 index 0000000..59306b6 --- /dev/null +++ b/performance/lmbench3/doc/references-memhier @@ -0,0 +1,251 @@ +%z Article +%K Staelin02b +%A Carl Staelin +%T lmbench3: Measuring scalability +%D November 2002 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Staelin02c +%A Carl Staelin +%T Utilizing intra-processor parallelism +%D December 2002 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Whaley98 +%A R. 
Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%C Proceedings of the 1998 ACM/IEEE SC98 Conference +%D 1998 +%O http://sourceforge.net/projects/math-atlas + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%C Proceedings USENIX Annual Technical Conference +%c New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%C Proceedings USENIX Winter Conference +%c San Diego, CA +%D January 1996 +%P 279-284 + +%a Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%R Masters Thesis +%I School of Computing, University of Utah +%c Salt Lake City, Utah +%D May 2001 +%O http://www.cs.utah.edu/~uros/thesis/thesis.pdf + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. 
Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in understanding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. 
Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware platforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. 
Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. 
McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Article +%K Min01 +%A Rui Min +%A Yiming Hu +%T Improving performance of large physically indexed caches by decoupling memory addresses from cache addresses +%J IEEE Transactions on Computers +%V 50 +%N 11 +%D November 2001 +%P 1191-1201 diff --git a/performance/lmbench3/doc/references-parallel b/performance/lmbench3/doc/references-parallel new file mode 100644 index 0000000..869f794 --- /dev/null +++ b/performance/lmbench3/doc/references-parallel @@ -0,0 +1,171 @@ +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%C Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Article +%K Kumar97 +%A A. Kumar +%T The HP PA-8000 RISC CPU +%J IEEE Micro +%V 17 +%N 2 +%D March-April 1997 +%P 27-32 + +%z Article +%K Schlansker00 +%A M.S. Schlansker +%A B.R. Rau +%T EPIC: Explicitly parallel instruction computing +%J IEEE Computer +%V 33 +%N 2 +%D Feb. 2000 +%P 37-45 + +%z Article +%K Smith95 +%A James E. Smith +%A Gurindar S. 
Sohi +%T The microarchitecture of superscalar processors +%J Proceedings of the IEEE +%V 83 +%D October 1995 +%P 1609-1624 + +%z Thesis +%K Munoz97 +%A Raul E. Silvera Munoz +%T Static instruction scheduling for dynamic issue processors +%I ACAPS Laboratory, School of Computer Science, McGill University +%D 1997 + +%z Article +%K Agarwal96 +%A Ramesh K. Agarwal +%T A super scalar sort algorithm for RISC processors +%C Proceedings 1996 ACM SIGMOD International Conference on Management of Data +%D 1996 +%P 240-246 +%O http://citeseer.nj.nec.com/agarwal96super.html + +%z Article +%K Staelin01a +%A Carl Staelin +%T Analyzing the memory hierarchy +%D October 2001 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Staelin01b +%A Carl Staelin +%T lmbench3: Measuring scalability +%D October 2001 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Frigo98 +%A M. Frigo +%A S.G. Johnson +%T FFTW: An adaptive software architecture for the FFT +%C Proceedings 1998 ICASSP +%V 3 +%P 1381-1384 +%O http://www.fftw.org/fftw-paper-icassp.pdf + +%z Article +%K Whaley98 +%A R. Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%C Proceedings of the 1998 ACM/IEEE SC98 Conference +%D 1998 +%O http://sourceforge.net/projects/math-atlas + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%C Proceedings USENIX Annual Technical Conference +%c New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%C Proceedings USENIX Winter Conference +%c San Diego, CA +%D January 1996 +%P 279-284 + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%R Masters Thesis +%I School of Computing, University of Utah +%C Salt Lake City, Utah +%D May 2001 +%O http://www.cs.utah.edu/~uros/thesis/thesis.pdf + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. 
Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Book +%K Knuth73 +%A Donald E. Knuth +%T The Art of computer programming, 2nd Edition +%I Addison-Wesley +%D 1973 + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 diff --git a/performance/lmbench3/doc/references-userguide b/performance/lmbench3/doc/references-userguide new file mode 100644 index 0000000..f6fea3d --- /dev/null +++ b/performance/lmbench3/doc/references-userguide @@ -0,0 +1,338 @@ +%z Article +%K Banga97 +%A Guarav Banga +%A Peter Druschel +%T Measuring the capacity of a web server +%B Proceedings USENIX Symposium on Internet Technologies and Systems +%C Monterey, CA +%D December 1997 + +%z Article +%K Banga98 +%A Guarav Banga +%A Jeffrey C. Mogul +%T Scalable kernel performance for internet servers under realistic loads +%B Proceedings of the 1998 USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 + +%K Bray90 +%A Tim Bray +%T Bonnie benchmark +%D 1990 +%O http://www.textuality.com/bonnie/ + +%z Article +%K Brown97 +%A Aaron Brown +%A Margo Seltzer +%T Operating system benchmarking in the wake of lmbench: A case study of the performance of NetBSD on the Intel x86 architecture +%B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems +%C Seattle, WA +%D June 1997 +%P 214-224 +%O http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html + +%z Article +%K Chen93d +%A Peter M. 
Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. 
We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Howard88 +%A J. Howard +%A M. Kazar +%A S. Menees +%A S. Nichols +%A M. Satyanrayanan +%A R. Sidebotham +%A M. 
West +%T Scale and performance in a distributed system +%J ACM Transactions on Computer Systems +%V 6 +%N 1 +%D February 1988 +%P 51-81 +%k Andrew benchmark + +%z Book +%K Jain91 +%A Raj Jain +%T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling +%I Wiley-Interscience +%C New York, NY +%D April 1991 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%B Proceedings USENIX Winter Conference +%C Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%B Proceedings USENIX Winter Conference +%C San Diego, CA +%D January 1996 +%P 279-284 + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%B Proceedings USENIX Summer Conference +%C Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. 
(Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%I Department of Computer Science, University of Utah +%D May 2001 + +%z Thesis +%K Saavedra92 +%A Rafael H. Saavedra-Barrera +%T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking +%I Department of Computer Science, University of California at Berkeley +%D 1992 + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. 
Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%k Seltzer99 +%A Margo Seltzer +%A David Krinsky +%A Keith Smith +%A Xiolan Zhang +%T The case for application-specific benchmarking +%B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems +%C Rico, AZ +%D 1999 + +%z Article +%K Shein89 +%A Barry Shein +%A Mike Callahan +%A Paul Woodbury +%T NFSSTONE: A network file server performance benchmark +%B Proceedings USENIX Summer Conference +%C Baltimore, MD +%D June 1989 +%P 269-275 + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%B Proceedings USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%B Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Article +%K Weicker84 +%A R.P. 
Weicker +%T Dhrystone: A synthetic systems programming benchmark +%J CACM +%V 27 +%N 10 +%P 1013-1030 +%D 1984 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + diff --git a/performance/lmbench3/doc/references.private b/performance/lmbench3/doc/references.private new file mode 100644 index 0000000..7394354 --- /dev/null +++ b/performance/lmbench3/doc/references.private @@ -0,0 +1,7 @@ +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D Dec. 1995 diff --git a/performance/lmbench3/doc/reporting.3 b/performance/lmbench3/doc/reporting.3 new file mode 100644 index 0000000..e63124a --- /dev/null +++ b/performance/lmbench3/doc/reporting.3 @@ -0,0 +1,71 @@ +.\" +.\" @(#)lmbench.man 2.0 98/04/24 +.\" +.\" lmbench - benchmarking toolbox +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench reporting" 3 "$Date:" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH "NAME" +milli, micro, nano, mb, kb \- the lmbench reporting subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "void milli(char *s, uint64 n)" +.LP +.B "void micro(char *s, uint64 n)" +.LP +.B "void nano(char *s, uint64 n)" +.LP +.B "void mb(uint64 bytes)" +.LP +.B "void kb(uint64 bytes)" +.SH "DESCRIPTION" +Creating benchmarks using the +.I lmbench +timing harness is easy. 
+Since it is so easy to measure performance using +.IR lmbench , +it is possible to quickly answer questions that arise during system +design, development, or tuning. For example, image processing +.LP +There are two attributes that are critical for performance, latency +and bandwidth, and +.IR lmbench 's +timing harness makes it easy to measure and report results for both. +The measurement interface, +.B benchmp +is the same, but the reporting functions are different. +Latency is usually important for frequently executed operations, and +bandwidth is usually important when moving large chunks of data. +.TP +.B "void milli(char *s, uint64 n)" +print out the time per operation in milli-seconds. +.I n +is the number of operations during the timing interval, which is passed +as a parameter because each +.I loop_body +can contain several operations. +.TP +.B "void micro(char *s, uint64 n)" +print the time per operation in micro-seconds. +.TP +.B "void nano(char *s, uint64 n)" +print the time per operation in nano-seconds. +.TP +.B "void mb(uint64 bytes)" +print the bandwidth in megabytes per second. +.TP +.B "void kb(uint64 bytes)" +print the bandwidth in kilobytes per second. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), timing(3), results(3) +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/results.3 b/performance/lmbench3/doc/results.3 new file mode 100644 index 0000000..b6d099d --- /dev/null +++ b/performance/lmbench3/doc/results.3 @@ -0,0 +1,88 @@ +.\" +.\" @(#)results.man 2.0 98/04/24 +.\" +.\" results - lmbench results subsystem +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench result management" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH" +.SH "NAME" +insertinit, insertsort, get_results, set_results, save_median, save_minimum + \- the lmbench results subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "#define TRIES 11" +.LP +.B "typedef struct { uint64 u, n } value_t;" +.LP +.B "typedef struct { int N; value_t v[TRIES]; } result_t;" +.LP +.B "int sizeof_result(int N)" +.LP +.B "void insertinit(result_t *r)" +.LP +.B "void insertsort(uint64 u, uint64 n, result_t *r)" +.LP +.B "result_t* get_results()" +.LP +.B "void set_results(result_t *r)" +.LP +.B "void save_median()" +.LP +.B "void save_minimum()" +.SH "DESCRIPTION" +These routines provide some simple data management functionality. +In most cases, you will not need these routines. +.LP +The current timing results can be accessed using the routines in +timing(3). The current timing results may be modified using +.B save_median +and +.BR save_minimum . +.TP +.B "int sizeof_result(int N)" +returns the number of bytes to allocate for a result_t which contains +.I N +results. +.TP +.B "void insertinit(result_t *r)" +initializes the results array. +.TP +.B "void insertsort(uint64 u, uint64 n, result_t *r)" +insert +.I u +and +.I n +into +.IR r . +Results are sorted in decreasing order by +.IR u/n . +.TP +.B "void get_results(result_t *r)" +get a copy of the current results. +.TP +.B "void set_results(result_t *r)" +save a copy +.I r +as the current results. +.TP +.B "void save_median()" +sets the timing results to the median of the current results. 
+.TP +.B "void save_minimum()" +sets the timing results to the minimum of the current results. +.LP +Results are sorted in ascending order, so the minimum value is at +.B TRIES-1 +and the maximum value is at +.BR 0 . +.SH "FUTURES" +Development of \fIlmbench\fR is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), reporting(3), results(3) +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/stream.8 b/performance/lmbench3/doc/stream.8 new file mode 100644 index 0000000..762c710 --- /dev/null +++ b/performance/lmbench3/doc/stream.8 @@ -0,0 +1,28 @@ +.\" $Id$ +.TH stream 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +stream \- John McCalpin's STREAM benchmark +.SH SYNOPSIS +.B stream +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B stream +mimics John McCalpin's STREAM benchmark. It measures memory bandwidth. +.SH BUGS +.B stream +is an experimental benchmark, but it seems to work well on most +systems. +.SH "SEE ALSO" +lmbench(8), bw_mem(8), line(8), tlb(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/timing.3 b/performance/lmbench3/doc/timing.3 new file mode 100644 index 0000000..2ebea3a --- /dev/null +++ b/performance/lmbench3/doc/timing.3 @@ -0,0 +1,163 @@ +.\" +.\" @(#)timing.man 2.0 98/04/24 +.\" +.\" timing - lmbench timing subsystem +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench timing" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH" + +.SH "NAME" +benchmp, benchmp_getstate, benchmp_interval, + start, stop, get_n, set_n, gettime, settime, + get_enough, t_overhead, l_overhead \- the lmbench timing subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "typedef u_long iter_t" +.LP +.B "typedef (*bench_f)(iter_t iterations, void* cookie)" +.LP +.B "typedef (*support_f)(iter_t iterations, void* cookie)" +.LP +.B "void benchmp(support_f initialize, bench_f benchmark, support_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie)" +.LP +.B "void* benchmp_getstate()" +.LP +.B "iter_t benchmp_interval(void* state)" +.LP +.B "void start(struct timeval *begin)" +.LP +.B "uint64 stop(struct timeval *begin, struct timeval *end)" +.LP +.B "uint64 get_n()" +.LP +.B "void set_n(uint64 n)" +.LP +.B "uint64 gettime()" +.LP +.B "void settime(uint64 u)" +.LP +.B "uint64 get_enough(uint64 enough)" +.LP +.B "uint64 t_overhead()" +.LP +.B "double l_overhead()" +.SH "DESCRIPTION" +The single most important element of a good benchmarking system is +the quality and reliability of its measurement system. +.IR lmbench 's +timing subsystem manages the experimental timing process to produce +accurate results in the least possible time. +.I lmbench +includes methods for measuring and eliminating several factors that +influence the accuracy of timing measurements, such as the resolution +of the system clock. 
+.LP +.I lmbench +gets accurate results by considering clock resolution, +auto-sizing the duration of each benchmark, and conducting multiple +experiments. +.TP +.B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" +measures the performance of +.I benchmark +repeatedly and reports the median result. +.I benchmp +creates +.I parallel +sub-processes which run +.I benchmark +in parallel. This allows lmbench to measure the system's ability to +scale as the number of client processes increases. Each sub-process +executes +.I initialize +before starting the benchmarking cycle. It will call +.I benchmark +several times in order to collect +.I repetitions +results. After all the benchmark results have been collected, +.I cleanup +is called to cleanup any resources which may have been allocated +by +.I initialize +or +.I benchmark . +.I cookie +is a void pointer to a hunk of memory that can be used to store any +parameters or state that is needed by the benchmark. +.TP +.B "void benchmp_getstate()" +returns a void pointer to the lmbench-internal state used during +benchmarking. The state is not to be used or accessed directly +by clients, but rather would be passed into +.I benchmp_interval. +.TP +.B "iter_t benchmp_interval(void* state)" +returns the number of times the benchmark should execute its +benchmark loop during this timing interval. This is used only +for weird benchmarks which cannot implement the benchmark +body in a function which can return, such as the page fault +handler. Please see +.I lat_sig.c +for sample usage. +.TP +.B "void start(struct timeval *begin)" +starts a timing interval. If +.I begin +is non-null, save the start time in +.I begin . +.TP +.B "uint64 stop(struct timeval *begin, struct timeval *end)" +stops a timing interval, returning the number of elapsed micro-seconds. +.TP +.B "uint64 get_n()" +returns the number of times +.I loop_body +was executed during the timing interval. 
+.TP +.B "void set_n(uint64 n)" +sets the number of times +.I loop_body +was executed during the timing interval. +.TP +.B "uint64 gettime()" +returns the number of micro-seconds in the timing interval. +.TP +.B "void settime(uint64 u)" +sets the number of micro-seconds in the timing interval. +.TP +.B "uint64 get_enough(uint64 enough)" +return the time in micro-seconds needed to accurately measure a timing +interval. +.TP +.B "uint64 t_overhead()" +return the time in micro-seconds needed to measure time. +.TP +.B "double l_overhead()" +return the time in micro-seconds needed to do a simple loop. +.SH "VARIABLES" +There are three environment variables that can be used to modify +the +.I lmbench +timing subsystem: ENOUGH, TIMING_O, and LOOP_O. +The environment variables can be used to directly set the results +of +.B get_enough , +.B t_overhead , +and +.B l_overhead . +When running a large number of benchmarks, or repeating the same +benchmark many times, this can save time by eliminating the necessity +of recalculating these values for each run. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), reporting(3), results(3). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/tlb.8 b/performance/lmbench3/doc/tlb.8 new file mode 100644 index 0000000..b95920b --- /dev/null +++ b/performance/lmbench3/doc/tlb.8 @@ -0,0 +1,55 @@ +.\" $Id$ +.TH TLB 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +tlb \- TLB size and latency benchmark +.SH SYNOPSIS +.B tlb +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B tlb +tries to determine the size, in pages, of the TLB. +The largest amount of memory it will examine is +.I len +bytes. +.LP +.B tlb +compares the memory latency for two different pointer chains. 
+The two chains occupy the same amount of cache space, but they stress +the memory subsystem differently. The first chain accesses one word +per page, while the second chain +randomly jumps through all the lines on a page before jumping to the +next page. When all of the pointers reside in the cache (which is the +usual case), and all of the pages for the first chain reside in the +TLB, then the average memory latencies should be identical. Assuming +there is a fixed size TLB, then at some point the number of pages +accessed by the first page will be larger than the TLB. At this point +the average latency for each memory access for the first chain will be +a cache hit plus some fraction of a TLB miss. +.LP +Once the TLB boundary is located +.B tlb +reports the TLB miss latency as the TLB latency for twice as many +pages as the TLB can hold. +.SH BUGS +.B tlb +is an experimental benchmark, but it seems to work well on most +systems. However, if a processor has a TLB hierarchy +.B tlb +only finds the top level TLB. +.SH "SEE ALSO" +lmbench(8), line(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/tmac.usenix b/performance/lmbench3/doc/tmac.usenix new file mode 100644 index 0000000..e66ac1f --- /dev/null +++ b/performance/lmbench3/doc/tmac.usenix @@ -0,0 +1,1848 @@ +.ig +Copyright (C) 1990, 1991 Free Software Foundation, Inc. + Written by James Clark (jjc@xxxxxxxxxxx) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 1, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License along +with groff; see the file LICENSE. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +.. +.if !\n(.g .ab These ms macros require groff. +.if \n(.C \ +. ab The groff ms macros do not work in compatibility mode. +.\" Enable warnings. You can delete this if you want. +.warn +.\" See if already loaded. +.if r GS .nx /dev/null +.nr GS 1 +.de @error +.tm \\n(.F:\\n(.c: macro error: \\$* +.. +.de @warning +.tm \\n(.F:\\n(.c: macro warning: \\$* +.. +.de @fatal +.ab \\n(.F:\\n(.c: fatal macro error: \\$* +.. +.de @not-implemented +.@error sorry, \\$0 not implemented +.als \\$0 @nop +.. +.als TM @not-implemented +.als CT @not-implemented +.de @nop +.. +.de @init +.\" a non-empty environment +.ev ne +\c +.ev +.ev nf +'nf +.ev +.. +.ds REFERENCES References +.ds ABSTRACT ABSTRACT +.ds TOC Table of Contents +.ds MONTH1 January +.ds MONTH2 February +.ds MONTH3 March +.ds MONTH4 April +.ds MONTH5 May +.ds MONTH6 June +.ds MONTH7 July +.ds MONTH8 August +.ds MONTH9 September +.ds MONTH10 October +.ds MONTH11 November +.ds MONTH12 December +.ds MO \\*[MONTH\n[mo]] +.nr *year \n[yr]+1900 +.ds DY \n[dy] \*[MO] \n[*year] +.de ND +.if \\n[.$] .ds DY "\\$* +.. +.de DA +.if \\n[.$] .ds DY "\\$* +.ds CF \\*[DY] +.. +.\" indexing +.de IX +.tm \\$1\t\\$2\t\\$3\t\\$4 ... \\n[PN] +.. +.\" print an error message and then try to recover +.de @error-recover +.@error \\$@ (recovering) +.nr *pop-count 0 +.while !'\\n(.z'' \{\ +. \"@warning automatically terminating diversion \\n(.z +. ie d @div-end!\\n(.z .@div-end!\\n(.z +. el .*div-end-default +. nr *pop-count +1 +. \" ensure that we don't loop forever +. if \\n[*pop-count]>20 .@fatal recovery failed +.\} +.while !'\\n[.ev]'0' .ev +.par@reset-env +.par@reset +.. +.de *div-end-default +.ds *last-div \\n(.z +.br +.di +.ev nf +.\\*[*last-div] +.ev +.. 
+.\" **************************** +.\" ******** module cov ******** +.\" **************************** +.\" Cover sheet and first page. +.de cov*err-not-after-first-page +.@error \\$0 is not allowed after the first page has started +.. +.de cov*err-not-before-tl +.@error \\$0 is not allowed before TL +.. +.de cov*err-not-again +.@error \\$0 is not allowed more than once +.. +.de cov*err-not-after-ab +.@error \\$0 is not allowed after first AB, LP, PP, IP, SH or NH +.. +.als AU cov*err-not-before-tl +.als AI cov*err-not-before-tl +.als AB cov*err-not-before-tl +.de cov*first-page-init +.rm cov*first-page-init +.par@init +.als RP cov*err-not-after-first-page +.@init +.ie \\n[cov*rp-format] \{\ +. pg@cs-top +. als FS cov*FS +. als FE cov*FE +.\} +.el \{\ +. pg@top +. als FS @FS +. als FE @FE +.\} +.wh 0 pg@top +.. +.wh 0 cov*first-page-init +.\" This handles the case where FS occurs before TL or LP. +.de FS +.br +\\*[FS]\\ +.. +.nr cov*rp-format 0 +.nr cov*rp-no 0 +.\" released paper format +.de RP +.nr cov*rp-format 1 +.if \\n[.$] .if '\\$1'no' .nr cov*rp-no 1 +.pn 0 +.. +.de TL +.br +.als TL cov*err-not-again +.rn @AB AB +.rn @AU AU +.rn @AI AI +.di cov*tl-div +.par@reset +.ft 3 +.ie \\n[VARPS] \{\ +. ps 14 +. vs 16 +.\} +.el \{\ +. ps +2 +. vs +3p +.\} +.ll (u;\\n[LL]*5/6) +.nr cov*n-au 0 +.. +.de @AU +.par@reset +.if !'\\n(.z'' \{\ +. br +. di +.\} +.nr cov*n-au +1 +.di cov*au-div!\\n[cov*n-au] +.nf +.ft 2 +.ps \\n[PS] +.. +.de @AI +.par@reset +.if !'\\n(.z'' \{\ +. br +. di +.\} +.ie !\\n[cov*n-au] .@error AI before AU +.el \{\ +. di cov*ai-div!\\n[cov*n-au] +. nf +. ft 1 +. ps \\n[PS] +.\} +.. +.de LP +.if !'\\n[.z]'' \{\ +. br +. di +.\} +.br +.cov*ab-init +.cov*print +\\*[\\$0]\\ +.. 
+.als IP LP +.als PP LP +.als XP LP +.als NH LP +.als SH LP +.als MC LP +.als RT LP +.de cov*ab-init +.als cov*ab-init @nop +.als LP @LP +.als IP @IP +.als PP @PP +.als XP @XP +.als RT @RT +.als SH @SH +.als NH @NH +.als QP @QP +.als RS @RS +.als RE @RE +.als QS @QS +.als QE @QE +.als MC @MC +.als EQ @EQ +.als EN @EN +.als AB cov*err-not-after-ab +.als AU par@AU +.als AI par@AI +.als TL par@TL +.. +.de @AB +.if !'\\n(.z'' \{\ +. br +. di +.\} +.cov*ab-init +.di cov*ab-div +.par@ab-indent +.par@reset +.if !'\\$1'no' \{\ +. ft 2 +. ce 1 +\\*[ABSTRACT] +. sp +. ft 1 +.\} +.ns +.@PP +.. +.de AE +.ie '\\n(.z'cov*ab-div' \{\ +. als AE cov*err-not-again +. br +. di +.\" nr cov*ab-height \\n[dn] +. par@reset-env +. par@reset +. cov*print +.\} +.el .@error AE without AB +.. +.de @div-end!cov*ab-div +.AE +.. +.de cov*print +.als cov*print @nop +.ie d cov*tl-div \{\ +. ie \\n[cov*rp-format] .cov*rp-print +. el .cov*draft-print +.\} +.el \{\ +. if \\n[cov*rp-format] \{\ +. @warning RP format but no TL +. bp 1 +. als FS @FS +. als FE @FE +. \} +. br +.\} +.. +.de cov*rp-print +.nr cov*page-length \\n[.p] +.pl 1000i +.cov*tl-au-print +.sp 3 +.if d cov*ab-div \{\ +. nf +. cov*ab-div +.\} +.sp 3 +.par@reset +\\*[DY] +.br +.if \\n[cov*fn-height] \{\ +. sp |(u;\\n[cov*page-length]-\\n[FM]\ +-\\n[cov*fn-height]-\\n[fn@sep-dist]>?\\n[nl]) +. fn@print-sep +. ev nf +. cov*fn-div +. ev +. ie \\n[cov*rp-no] .rm cov*fn-div +. el \{\ +. rn cov*fn-div fn@overflow-div +. nr fn@have-overflow 1 +. \} +.\} +.als FS @FS +.als FE @FE +.\" If anything was printed below where the footer line is normally printed, +.\" then that's an overflow. +.if -\\n[FM]/2+1v+\\n[cov*page-length]<\\n[nl] .@error cover sheet overflow +.pl \\n[cov*page-length]u +.bp 1 +.if !\\n[cov*rp-no] .cov*tl-au-print +.rs +.sp 1 +.. +.de cov*draft-print +.cov*tl-au-print +.if d cov*ab-div \{\ +. nf +. sp 2 +. cov*ab-div +.\} +.sp 1 +.. 
+.de cov*tl-au-print +.par@reset +.nf +.rs +.sp 3 +.ce 9999 +.cov*tl-div +.nr cov*i 1 +.nr cov*sp 1v +.while \\n[cov*i]<=\\n[cov*n-au] \{\ +. sp \\n[cov*sp]u +. cov*au-div!\\n[cov*i] +. ie d cov*ai-div!\\n[cov*i] \{\ +. sp .5v +. cov*ai-div!\\n[cov*i] +. nr cov*sp 1v +. \} +. el .nr cov*sp .5v +. nr cov*i +1 +.\} +.ce 0 +.. +.nr cov*fn-height 0 +.nr cov*in-fn 0 +.\" start of footnote on cover +.de cov*FS +.if \\n[cov*in-fn] \{\ +. @error nested FS +. FE +.\} +.nr cov*in-fn 1 +.ev fn +.par@reset-env +.da cov*fn-div +.if !\\n[cov*fn-height] .ns +.ie \\n[.$] .FP "\\$1" no +.el .@LP +.. +.de @div-end!cov*fn-div +.cov*FE +.. +.\" end of footnote on cover +.de cov*FE +.ie '\\n(.z'cov*fn-div' \{\ +. br +. ev +. di +. nr cov*in-fn 0 +. nr cov*fn-height +\\n[dn] +.\} +.el .@error FE without matching FS +.. +.\" *************************** +.\" ******** module pg ******** +.\" *************************** +.\" Page-level formatting. +.\" > 0 if we have a footnote on the current page +.nr pg@fn-flag 0 +.nr pg@colw 0 +.nr pg@fn-colw 0 +.nr HM 1i +.nr FM 1i +.nr PO 1.25i +.ds LF +.ds CF +.ds RF +.ds LH +.ds CH -\\n[PN]- +.ds RH +.ds pg*OH '\\*[LH]'\\*[CH]'\\*[RH]' +.ds pg*EH '\\*[LH]'\\*[CH]'\\*[RH]' +.ds pg*OF '\\*[LF]'\\*[CF]'\\*[RF]' +.ds pg*EF '\\*[LF]'\\*[CF]'\\*[RF]' +.de OH +.ds pg*\\$0 "\\$* +.. +.als EH OH +.als OF OH +.als EF OH +.de PT +.ie \\n%=1 .if \\n[pg*P1] .tl \\*[pg*OH] +.el \{\ +. ie o .tl \\*[pg*OH] +. el .tl \\*[pg*EH] +.\} +.. +.de BT +.ie o .tl \\*[pg*OF] +.el .tl \\*[pg*EF] +.. +.nr pg*P1 0 +.de P1 +.nr pg*P1 1 +.. +.wh -\n[FM]u pg@bottom +.wh -\n[FM]u/2u pg*footer +.nr MINGW 2n +.nr pg@ncols 1 +.de @MC +.if !'\\n(.z'' .error-recover MC while diversion open +.br +.ie \\n[pg@ncols]>1 .pg@super-eject +.el \{\ +. \" flush out any floating keeps +. while \\n[kp@tail]>\\n[kp@head] \{\ +. rs +. bp +. \} +.\} +.ie !\\n(.$ \{\ +. nr pg@colw \\n[LL]*7/15 +. nr pg*gutw \\n[LL]-(2*\\n[pg@colw]) +. nr pg@ncols 2 +.\} +.el \{\ +. nr pg@colw (n;\\$1)<?\\n[LL] +. 
ie \\n[.$]<2 .nr pg*gutw \\n[MINGW] +. el .nr pg*gutw (n;\\$2) +. nr pg@ncols \\n[LL]-\\n[pg@colw]/(\\n[pg@colw]+\\n[pg*gutw])+1 +. ie \\n[pg@ncols]>1 \ +. nr pg*gutw \\n[LL]-(\\n[pg@ncols]*\\n[pg@colw])/(\\n[pg@ncols]-1) +. el .nr pg*gutw 0 +.\} +.mk pg*col-top +.ns +.nr pg*col-num 0 +.nr pg@fn-colw \\n[pg@colw]*5/6 +.par@reset +.. +.de 2C +.MC +.. +.de 1C +.MC \\n[LL]u +.. +.\" top of page macro +.de pg@top +.ch pg*footer -\\n[FM]u/2u +.nr PN \\n% +.nr pg*col-num 0 +.nr pg@fn-bottom-margin 0 +.nr pg*saved-po \\n[PO] +.po \\n[PO]u +.ev h +.par@reset +.sp (u;\\n[HM]/2) +.PT +.sp |\\n[HM]u +.if d HD .HD +.mk pg@header-bottom +.ev +.mk pg*col-top +.pg*start-col +.. +.de pg*start-col +.\" Handle footnote overflow before floating keeps, because the keep +.\" might contain an embedded footnote. +.fn@top-hook +.kp@top-hook +.tbl@top-hook +.ns +.. +.de pg@cs-top +.sp \\n[HM]u +.\" move pg@bottom and pg*footer out of the way +.ch pg@bottom \\n[.p]u*2u +.ch pg*footer \\n[.p]u*2u +.ns +.. +.de pg@bottom +.tbl@bottom-hook +.if \\n[pg@fn-flag] .fn@bottom-hook +.nr pg*col-num +1 +.ie \\n[pg*col-num]<\\n[pg@ncols] .pg*end-col +.el .pg*end-page +.. +.de pg*end-col +'sp |\\n[pg*col-top]u +.po (u;\\n[pg*saved-po]+(\\n[pg@colw]+\\n[pg*gutw]*\\n[pg*col-num])) +.\"po +(u;\\n[pg@colw]+\\n[pg*gutw]) +.pg*start-col +.. +.de pg*end-page +.po \\n[pg*saved-po]u +.\" Make sure we don't exit if there are still floats or footnotes left-over. +.ie \\n[kp@head]<\\n[kp@tail]:\\n[fn@have-overflow] \{\ +. \" Switching environments ensures that we don't get an unnecessary +. \" blank line at the top of the page. +. ev ne +' bp +. ev +.\} +.el \{\ +. if r pg*next-number \{\ +. pn \\n[pg*next-number] +. rr pg*next-number +. if d pg*next-format \{\ +. af PN \\*[pg*next-format] +. rm pg*next-format +. \} +. \} +' bp +.\} +.. +.\" pg@begin number format +.de pg@begin +.ie \\n[.$]>0 \{\ +. nr pg*next-number (;\\$1) +. ie \\n[.$]>1 .ds pg*next-format \\$2 +. 
el .rm pg*next-format +.\} +.el .rr pg*next-number +.pg@super-eject +.. +.\" print the footer line +.de pg*footer +.ev h +.par@reset +.BT +.ev +.. +.\" flush out any keeps or footnotes +.de pg@super-eject +.br +.if !'\\n(.z'' .@error-recover diversion open while ejecting page +.\" Make sure we stay in the end macro while there is still footnote overflow +.\" left, or floating keeps. +.while \\n[kp@tail]>\\n[kp@head]:\\n[pg@fn-flag] \{\ +. rs +. bp +.\} +.bp +.. +.em pg@super-eject +.\" *************************** +.\" ******** module fn ******** +.\" *************************** +.\" Footnotes. +.nr fn@sep-dist 8p +.ev fn +.\" Round it vertically +.vs \n[fn@sep-dist]u +.nr fn@sep-dist \n[.v] +.ev +.nr fn*text-num 0 1 +.nr fn*note-num 0 1 +.ds * \\*[par@sup-start]\En+[fn*text-num]\\*[par@sup-end] +.nr fn*open 0 +.\" normal FS +.de @FS +.ie \\n[.$] .fn*do-FS "\\$1" no +.el \{\ +. ie \\n[fn*text-num]>\\n[fn*note-num] .fn*do-FS \\n+[fn*note-num] +. el .fn*do-FS +.\} +.. +.\" Second argument of `no' means don't embellish the first argument. +.de fn*do-FS +.if \\n[fn*open] .@error-recover nested FS +.nr fn*open 1 +.if \\n[.u] \{\ +. \" Ensure that the first line of the footnote is on the same page +. \" as the reference. I think this is minimal. +. ev fn +. nr fn*need 1v +. ev +. ie \\n[pg@fn-flag] .nr fn*need +\\n[fn:PD] +. el .nr fn*need +\\n[fn@sep-dist] +. ne \\n[fn*need]u+\\n[.V]u>?0 +.\} +.ev fn +.par@reset-env +.fn*start-div +.par@reset +.ie \\n[.$] .FP \\$@ +.el .@LP +.. +.de @FE +.ie !\\n[fn*open] .@error FE without FS +.el \{\ +. nr fn*open 0 +. br +. ev +. fn*end-div +.\} +.. +.nr fn@have-overflow 0 +.\" called at the top of each column +.de fn@top-hook +.nr fn*max-width 0 +.nr fn*page-bottom-pos 0-\\n[FM]-\\n[pg@fn-bottom-margin] +.ch pg@bottom \\n[fn*page-bottom-pos]u +.if \\n[fn@have-overflow] \{\ +. nr fn@have-overflow 0 +. fn*start-div +. ev nf +. fn@overflow-div +. ev +. fn*end-div +.\} +.. 
+.\" This is called at the bottom of the column if pg@fn-flag is set. +.de fn@bottom-hook +.nr pg@fn-flag 0 +.nr fn@have-overflow 0 +.nr fn@bottom-pos \\n[.p]-\\n[FM]-\\n[pg@fn-bottom-margin]+\\n[.v] +.ev fn +.nr fn@bottom-pos -\\n[.v] +.ev +.ie \\n[nl]+\\n[fn@sep-dist]+\n[.V]>\\n[fn@bottom-pos] \{\ +. rn fn@div fn@overflow-div +. nr fn@have-overflow 1 +.\} +.el \{\ +. if \\n[pg@ncols]>1 \ +. if \\n[fn*max-width]>\\n[pg@fn-colw] \ +. nr pg@fn-bottom-margin \\n[.p]-\\n[FM]-\\n[nl]+1v +. wh \\n[fn@bottom-pos]u fn*catch-overflow +. fn@print-sep +. ev nf +. fn@div +. rm fn@div +. ev +. if '\\n(.z'fn@overflow-div' \{\ +. di +. nr fn@have-overflow \\n[dn]>0 +. \} +. ch fn*catch-overflow +.\} +.. +.de fn*catch-overflow +.di fn@overflow-div +.. +.nr fn*embed-count 0 +.de @div-end!fn@div +.br +.if '\\n[.ev]'fn' .ev +.fn*end-div +.nr fn*open 0 +.. +.als @div-end!fn*embed-div @div-end!fn@div +.de fn*start-div +.ie '\\n(.z'' \{\ +. da fn@div +. if !\\n[pg@fn-flag] .ns +.\} +.el .di fn*embed-div +.. +.de fn*end-div +.ie '\\n(.z'fn@div' \{\ +. di +. nr fn*page-bottom-pos -\\n[dn] +. nr fn*max-width \\n[fn*max-width]>?\\n[dl] +. if !\\n[pg@fn-flag] .nr fn*page-bottom-pos -\\n[fn@sep-dist] +. nr pg@fn-flag 1 +. nr fn*page-bottom-pos \\n[nl]-\\n[.p]+\n[.V]>?\\n[fn*page-bottom-pos] +. ch pg@bottom \\n[fn*page-bottom-pos]u +.\} +.el \{\ +. ie '\\n(.z'fn*embed-div' \{\ +. di +. rn fn*embed-div fn*embed-div!\\n[fn*embed-count] +\!. fn*embed-start \\n[fn*embed-count] +. rs +' sp (u;\\n[dn]+\\n[fn@sep-dist]+\\n[.V]) +\!. fn*embed-end +. nr fn*embed-count +1 +. \} +. el \{\ +. ev fn +. @error-recover unclosed diversion within footnote +. \} +.\} +.. +.de fn*embed-start +.ie '\\n(.z'' \{\ +. fn*start-div +. ev nf +. fn*embed-div!\\$1 +. rm fn*embed-div!\\$1 +. ev +. fn*end-div +. di fn*null +.\} +.el \{\ +\!. fn*embed-start \\$1 +. rs +.\} +.. +.de fn*embed-end +.ie '\\n(.z'fn*null' \{\ +. di +. rm fn*null +.\} +.el \!.fn*embed-end +.. 
+.\" It's important that fn@print-sep use up exactly fn@sep-dist vertical space. +.de fn@print-sep +.ev fn +.in 0 +.vs \\n[fn@sep-dist]u +\D'l 1i 0' +.br +.ev +.. +.\" *************************** +.\" ******** module kp ******** +.\" *************************** +.\" Keeps. +.de KS +.br +.di kp*div +.. +.de KF +.if !'\\n(.z'' .@error-recover KF while open diversion +.di kp*fdiv +.ev k +.par@reset-env +.par@reset +.. +.de KE +.ie '\\n(.z'kp*div' .kp*end +.el \{\ +. ie '\\n(.z'kp*fdiv' .kp*fend +. el .@error KE without KS or KF +.\} +.. +.de @div-end!kp*div +.kp*end +.. +.de @div-end!kp*fdiv +.kp*fend +.. +.de kp*need +.ie '\\n(.z'' .ds@need \\$1 +.el \!.kp*need \\$1 +.. +.\" end non-floating keep +.de kp*end +.br +.di +.kp*need \\n[dn] +.ev nf +.kp*div +.ev +.rm kp*div +.. +.\" Floating keeps. +.nr kp@head 0 +.nr kp@tail 0 +.\" end floating keep +.de kp*fend +.br +.ev +.di +.ie \\n[.t]-(\\n[.k]>0*1v)>\\n[dn] \{\ +. br +. ev nf +. kp*fdiv +. rm kp*fdiv +. ev +.\} +.el \{\ +. rn kp*fdiv kp*div!\\n[kp@tail] +. nr kp*ht!\\n[kp@tail] 0\\n[dn] +. nr kp@tail +1 +.\} +.. +.\" top of page processing for KF +.nr kp*doing-top 0 +.de kp@top-hook +.if !\\n[kp*doing-top] \{\ +. nr kp*doing-top 1 +. kp*do-top +. nr kp*doing-top 0 +.\} +.. +.de kp*do-top +.\" If the first keep won't fit, only force it out if we haven't had a footnote +.\" and we're at the top of the page. +.nr kp*force \\n[pg@fn-flag]=0&(\\n[nl]<=\\n[pg@header-bottom]) +.nr kp*fits 1 +.while \\n[kp@tail]>\\n[kp@head]&\\n[kp*fits] \{\ +. ie \\n[.t]>\\n[kp*ht!\\n[kp@head]]:\\n[kp*force] \{\ +. nr kp*force 0 +. \" It's important to advance kp@head before bringing +. \" back the keep, so that if the last line of the +. \" last keep springs the bottom of page trap, a new +. \" page will not be started unnecessarily. +. rn kp*div!\\n[kp@head] kp*temp +. nr kp@head +1 +. ev nf +. kp*temp +. ev +. rm kp*temp +. \} +. el .nr kp*fits 0 +.\} +.. 
+.\" *************************** +.\" ******** module ds ******** +.\" *************************** +.\" Displays and non-floating keeps. +.de DE +.ds*end!\\n[\\n[.ev]:ds-type] +.nr \\n[.ev]:ds-type 0 +.. +.de ds@auto-end +.if \\n[\\n[.ev]:ds-type] \{\ +. @error automatically terminating display +. DE +.\} +.. +.de @div-end!ds*div +.ie \\n[\\n[.ev]:ds-type] .DE +.el .ds*end!2 +.. +.de ds*end!0 +.@error DE without DS, ID, CD, LD or BD +.. +.de LD +.br +.nr \\n[.ev]:ds-type 1 +.par@reset +.nf +.sp \\n[DD]u +.. +.de ID +.LD +.ie \\n[.$] .in +(n;\\$1) +.el .in +\\n[DI]u +.. +.de CD +.LD +.ce 9999 +.. +.de RD +.LD +.rj 9999 +.. +.de ds*common-end +.par@reset +.sp \\n[DD]u +.. +.als ds*end!1 ds*common-end +.de BD +.LD +.nr \\n[.ev]:ds-type 2 +.di ds*div +.. +.de ds*end!2 +.br +.ie '\\n(.z'ds*div' \{\ +. di +. nf +. in (u;\\n[.l]-\\n[dl]/2) +. ds*div +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.de DS +.br +.di ds*div +.ie '\\$1'B' \{\ +. LD +. nr \\n[.ev]:ds-type 4 +.\} +.el \{\ +. ie '\\$1'L' .LD +. el \{\ +. ie '\\$1'C' .CD +. el \{\ +. ie '\\$1'R' .RD +. el \{\ +. ie '\\$1'I' .ID \\$2 +. el .ID \\$1 +. \} +. \} +. \} +. nr \\n[.ev]:ds-type 3 +.\} +.. +.de ds@need +.if '\\n(.z'' \{\ +. while \\n[.t]<=(\\$1)&(\\n[nl]>\\n[pg@header-bottom]) \{\ +. rs +' sp \\n[.t]u +. \} +.\} +.. +.de ds*end!3 +.br +.ie '\\n(.z'ds*div' \{\ +. di +. ds@need \\n[dn] +. ev nf +. ds*div +. ev +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.de ds*end!4 +.ie '\\n(.z'ds*div' \{\ +. br +. di +. nf +. in (u;\\n[.l]-\\n[dl]/2) +. ds@need \\n[dn] +. ds*div +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.\" **************************** +.\" ******** module par ******** +.\" **************************** +.\" Paragraph-level formatting. +.nr VARPS 0 +.nr PS 10 +.nr LL 6i +.de par*vs +.\" If it's too big to be in points, treat it as units. +.ie (p;\\$1)>=40p .vs (u;\\$1) +.el .vs (p;\\$1) +.. 
+.de par@ab-indent +.nr 0:li (u;\\n[LL]/12) +.nr 0:ri \\n[0:li] +.. +.de par*env-init +.aln \\n[.ev]:PS PS +.aln \\n[.ev]:VS VS +.aln \\n[.ev]:LL LL +.aln \\n[.ev]:MCLL LL +.aln \\n[.ev]:LT LT +.aln \\n[.ev]:MCLT LT +.aln \\n[.ev]:PI PI +.aln \\n[.ev]:PD PD +.par@reset-env +.. +.\" happens when the first page begins +.de par@init +.if !rLT .nr LT \\n[LL] +.if !rFL .nr FL \\n[LL]*5/6 +.if !rVS .nr VS \\n[PS]+2 +.ps \\n[PS] +.if !rDI .nr DI .5i +.if !rQI .nr QI 5n +.if !rPI .nr PI 5n +.par*vs \\n[VS] +.if !rPD .nr PD .3v +.if !rDD .nr DD .5v +.if !dFAM .ds FAM \\n[.fam] +.nr par*adj \\n[.j] +.par*env-init +.ev h +.par*env-init +.ev +.ev fn +.par*env-init +.ev +.ev k +.par*env-init +.ev +.aln 0:MCLL pg@colw +.aln 0:MCLT pg@colw +.aln k:MCLL pg@colw +.aln k:MCLT pg@colw +.if !rFPS .nr FPS \\n[PS]-2 +.if !rFVS .nr FVS (p;\\n[FPS]+2) +.if !rFI .nr FI 2n +.if !rFPD .nr FPD \\n[PD]/2 +.aln fn:PS FPS +.aln fn:VS FVS +.aln fn:LL FL +.aln fn:LT FL +.aln fn:PI FI +.aln fn:PD FPD +.aln fn:MCLL pg@fn-colw +.aln fn:MCLT pg@fn-colw +.. +.de par@reset-env +.nr \\n[.ev]:il 0 +.nr \\n[.ev]:li 0 +.nr \\n[.ev]:ri 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.nr \\n[.ev]:pli 0 +.nr \\n[.ev]:pri 0 +.nr \\n[.ev]:ds-type 0 +.. +.\" par@reset +.de par@reset +.br +.ce 0 +.rj 0 +.ul 0 +.fi +.ad \\n[par*adj] +.ie \\n[pg@ncols]>1 \{\ +. ll (u;\\n[\\n[.ev]:MCLL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) +. lt \\n[\\n[.ev]:MCLT]u +.\} +.el \{\ +. ll (u;\\n[\\n[.ev]:LL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) +. lt \\n[\\n[.ev]:LT]u +.\} +.in (u;\\n[\\n[.ev]:li]+\\n[\\n[.ev]:pli]) +.ft 1 +.fam \\*[FAM] +.ps \\n[\\n[.ev]:PS] +.par*vs \\n[\\n[.ev]:VS] +.ls 1 +.TA +.hy 14 +.. +.als @RT par@reset +.\" This can be redefined by the user. +.de TA +.ta T 5n +.. +.de par*start +.ds@auto-end +.nr \\n[.ev]:pli \\$1 +.nr \\n[.ev]:pri \\$2 +.par@reset +.sp \\n[\\n[.ev]:PD]u +.ne 1v+\\n(.Vu +.. +.de par@finish +.nr \\n[.ev]:pli 0 +.nr \\n[.ev]:pri 0 +.par@reset +.. 
+.\" normal LP +.de @LP +.par*start 0 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.. +.de @PP +.par*start 0 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.ti +\\n[\\n[.ev]:ai]u +.. +.de @QP +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par*start \\n[QI] \\n[QI] +.. +.de @XP +.par*start \\n[\\n[.ev]:PI] 0 +.ti -\\n[\\n[.ev]:PI]u +.. +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +. br +. \} +. rm par*label +.\} +.. +.de @RS +.br +.nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] +.nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] +.nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] +.nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] +.nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] +.nr \\n[.ev]:il +1 +.nr \\n[.ev]:li +\\n[\\n[.ev]:ai] +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par@reset +.. +.de @RE +.br +.ie \\n[\\n[.ev]:il] \{\ +. nr \\n[.ev]:il -1 +. nr \\n[.ev]:ai \\n[\\n[.ev]:ai!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:li \\n[\\n[.ev]:li!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:ri \\n[\\n[.ev]:ri!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:pli \\n[\\n[.ev]:pli!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:pri \\n[\\n[.ev]:pri!\\n[\\n[.ev]:il]] +.\} +.el .@error unbalanced \\$0 +.par@reset +.. +.\" --------------------------------------------------------------------------- +.de LINE +. br +. ps 32 +\l'\\n[.l]u-\\n[\\n[.ev]:ri]u-\\n[\\n[.ev]:pri]u' +. ps +.. +.\" --------------------------------------------------------------------------- +.de QSTART +. nr SaveQI \\n[QI] +. if \\n[.$] .nr QI \\$1 +. QS +. LINE +. ft 3 +.. +.\" --------------------------------------------------------------------------- +.de QEND +. ft P +. sp -.5 +. LINE +. QE +. nr QI \\n[SaveQI] +. if \\n[.$] .sp \\$1 +.. 
+.de @QS +.br +.nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] +.nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] +.nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] +.nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] +.nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] +.nr \\n[.ev]:il +1 +.nr \\n[.ev]:li +\\n[QI] +.nr \\n[.ev]:ri +\\n[QI] +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par@reset +.. +.als @QE @RE +.\" start boxed text +.de B1 +.br +.di par*box-div +.nr \\n[.ev]:li +1n +.nr \\n[.ev]:ri +1n +.par@reset +.. +.de @div-end!par*box-div +.B2 +.. +.\" end boxed text +.\" Postpone the drawing of the box until we're in the top-level diversion, +.\" in case there's a footnote inside the box. +.de B2 +.ie '\\n(.z'par*box-div' \{\ +. br +. di +. ds@need \\n[dn] +. par*box-mark-top +. ev nf +. par*box-div +. ev +. nr \\n[.ev]:ri -1n +. nr \\n[.ev]:li -1n +. par@finish +. par*box-draw \\n[.i]u \\n[.l]u +.\} +.el .@error B2 without B1 +.. +.de par*box-mark-top +.ie '\\n[.z]'' .mk par*box-top +.el \!.par*box-mark-top +.. +.de par*box-draw +.ie '\\n[.z]'' \{\ +. nr par*box-in \\n[.i] +. nr par*box-ll \\n[.l] +. nr par*box-vpt \\n[.vpt] +. vpt 0 +. in \\$1 +. ll \\$2 +\v'-1v+.25m'\ +\D'l (u;\\n[.l]-\\n[.i]) 0'\ +\D'l 0 |\\n[par*box-top]u'\ +\D'l -(u;\\n[.l]-\\n[.i]) 0'\ +\D'l 0 -|\\n[par*box-top]u' +. br +. sp -1 +. in \\n[par*box-in]u +. ll \\n[par*box-ll]u +. vpt \\n[par*box-vpt] +.\} +.el \!.par*box-draw \\$1 \\$2 +.. +.de @SH +.par@finish +.\" Keep together the heading and the first two lines of the next paragraph. +.\" XXX - fix for variable PS. +.ne 3v+\\n[\\n[.ev]:PD]u+\\n(.Vu +.sp 1 +.ft 3 +.if \\n[VARPS] .ps \\n[PS]+2 +.. +.\" TL, AU, and AI are aliased to these in cov*ab-init. +.de par@TL +.par@finish +.sp 1 +.ft 3 +.ps +2 +.vs +3p +.ce 9999 +.. +.de par@AU +.par@finish +.sp 1 +.ft I +.ce 9999 +.. +.de par@AI +.par@finish +.sp .5 +.ce 9999 +.. +.\" In paragraph macros. +.de NL +.ps \\n[\\n[.ev]:PS] +.. +.de SM +.ps -2 +.. +.de LG +.ps +2 +.. +.de R +.ft R +.. 
+.de par*set-font +.ie \\n[.$] \{\ +. nr par*prev-font \\n[.f] +\&\\$3\f[\\*[par*font-name!\\$0]]\\$1\f[\\n[par*prev-font]]\\$2 +.\} +.el .ft \\*[par*font-name!\\$0] +.. +.ds par*font-name!B 3 +.ds par*font-name!I 2 +.ds par*font-name!BI BI +.ds par*font-name!CW CR +.als B par*set-font +.als I par*set-font +.als BI par*set-font +.als CW par*set-font +.\" underline a word +.de UL +\Z'\\$1'\v'.25m'\D'l \w'\\$1'u 0'\v'-.25m'\\$2 +.. +.\" box a word +.de BX +.nr par*bxw \w'\\$1'+.4m +\Z'\v'.25m'\D'l 0 -1m'\D'l \\n[par*bxw]u 0'\D'l 0 1m'\D'l -\\n[par*bxw]u 0''\ +\Z'\h'.2m'\\$1'\ +\h'\\n[par*bxw]u' +.. +.\" The first time UX is used, put a registered mark after it. +.ds par*ux-rg \(rg +.de UX +\s[\\n[.s]*8u/10u]UNIX\s0\\$1\\*[par*ux-rg] +.ds par*ux-rg +.. +.ds par@sup-start \v'-.9m\s'\En[.s]*7u/10u'+.7m' +.als { par@sup-start +.ds par@sup-end \v'-.7m\s0+.9m' +.als } par@sup-end +.\" footnote paragraphs +.\" FF is the footnote format +.nr FF 0 +.\" This can be redefined. It gets a second argument of `no' if the first +.\" argument was supplied by the user, rather than automatically. +.de FP +.br +.if !d par*fp!\\n[FF] \{\ +. @error unknown footnote format `\\n[FF]' +. nr FF 0 +.\} +.ie '\\$2'no' .par*fp!\\n[FF]-no "\\$1" +.el .par*fp!\\n[FF] "\\$1" +.. +.de par*fp!0 +.@PP +\&\\*[par@sup-start]\\$1\\*[par@sup-end]\ \c +.. +.de par*fp!0-no +.@PP +\&\\$1\ \c +.. +.de par*fp!1 +.@PP +\&\\$1.\ \c +.. +.de par*fp!1-no +.@PP +\&\\$1\ \c +.. +.de par*fp!2 +.@LP +\&\\$1.\ \c +.. +.de par*fp!2-no +.@LP +\&\\$1\ \c +.. +.de par*fp!3 +.@IP "\\$1." (u;\\n[\\n[.ev]:PI]*2) +.. +.de par*fp!3-no +.@IP "\\$1" (u;\\n[\\n[.ev]:PI]*2) +.. +.\" *************************** +.\" ******** module nh ******** +.\" *************************** +.\" Numbered headings. +.\" nh*hl is the level of the last heading +.nr nh*hl 0 +.\" numbered heading +.de @NH +.ie '\\$1'S' \{\ +. shift +. nr nh*hl 0 +. while \\n[.$] \{\ +. nr nh*hl +1 +. nr H\\n[nh*hl] 0\\$1 +. shift +. \} +. if !\\n[nh*hl] \{\ +. 
nr H1 1 +. nr nh*hl 1 +. @error missing arguments to .NH S +. \} +.\} +.el \{\ +. nr nh*ohl \\n[nh*hl] +. ie \\n[.$] \{\ +. nr nh*hl 0\\$1 +. ie \\n[nh*hl]<=0 \{\ +. nr nh*ohl 0 +. nr nh*hl 1 +. \} +. el \{\ +. if \\n[nh*hl]-\\n[nh*ohl]>1 \ +. @warning .NH \\n[nh*ohl] followed by .NH \\n[nh*hl] +. \} +. \} +. el .nr nh*hl 1 +. while \\n[nh*hl]>\\n[nh*ohl] \{\ +. nr nh*ohl +1 +. nr H\\n[nh*ohl] 0 +. \} +. nr H\\n[nh*hl] +1 +.\} +.ds SN +.nr nh*i 0 +.while \\n[nh*i]<\\n[nh*hl] \{\ +. nr nh*i +1 +. as SN \\n[H\\n[nh*i]]. +.\} +.SH +.if \\n[VARPS] \{\ +. ps \\n[PS]+2 +. ne 3 +.\} +\\*[SN] +.. +.de VARPS +.nr VARPS 1 +.. +.\" **************************** +.\" ******** module toc ******** +.\" **************************** +.\" Table of contents generation. +.de XS +.da toc*div +.ev h +.par@reset +.fi +.ie \\n[.$] .XA "\\$1" +.el .XA +.. +.de @div-end!toc*div +.XE +.. +.de XA +.ie '\\n(.z'toc*div' \{\ +. if d toc*num .toc*end-entry +. ie \\n[.$] \{\ +. ie '\\$1'no' .ds toc*num +. el .ds toc*num "\\$1 +. \} +. el .ds toc*num \\n[PN] +. in (n;0\\$2) +.\} +.el .@error XA without XS +.. +.de XE +.ie '\\n(.z'toc*div' \{\ +. if d toc*num .toc*end-entry +. ev +. di +.\} +.el .@error XS without XE +.. +.de toc*end-entry +\\a\\t\\*[toc*num] +.br +.rm toc*num +.. +.de PX +.1C +.if !'\\$1'no' \{\ +. ce 1 +. ps \\n[PS]+2 +. ft 3 +\\*[TOC] +. ft +. ps +.\} +.nf +.char \[toc*leader-char] .\h'1m' +.lc \[toc*leader-char] +.ta (u;\\n[.l]-\\n[.i]-\w'000') (u;\\n[.l]-\\n[.i])R +.sp 2 +.toc*div +.par@reset +.. +.\" print the table of contents on page i +.de TC +.P1 +.pg@begin 1 i +.PX \\$1 +.. +.\" **************************** +.\" ******** module eqn ******** +.\" **************************** +.\" Eqn support. +.de EQ +.. +.de EN +.. +.de @EQ +.br +.ds eqn*num "\\$2 +.ie '\\$1'L' .nr eqn*type 0 +.el \{\ +. ie '\\$1'I' .nr eqn*type 1 +. el \{\ +. nr eqn*type 2 +. if !'\\$1'C' .ds eqn*num "\\$1 +. \} +.\} +.di eqn*div +.in 0 +.nf +.. +.de @div-end!eqn*div +.@EN +.. 
+.\" Note that geqn mark and lineup work correctly in centered equations. +.de @EN +.ie !'\\n(.z'eqn*div' .@error-recover mismatched EN +.el \{\ +. br +. di +. nr eqn*have-num 0 +. if !'\\*[eqn*num]'' .nr eqn*have-num 1 +. if \\n[dl]:\\n[eqn*have-num] \{\ +. sp \\n[DD]u +. par@reset +. ds eqn*tabs \\n[.tabs] +. nf +. ie \\n[dl] \{\ +. ds@need \\n[dn]u-1v+\n[.V]u +. chop eqn*div +. ie \\n[eqn*type]=0 \{\ +. ta (u;\\n[.l]-\\n[.i])R +\\*[eqn*div]\t\\*[eqn*num] +. \} +. el \{\ +. ie \\n[eqn*type]=1 .ta \\n[DI]u \ +(u;\\n[.l]-\\n[.i])R +. el .ta (u;\\n[.l]-\\n[.i]/2)C \ +(u;\\n[.l]-\\n[.i])R +\t\\*[eqn*div]\t\\*[eqn*num] +. \} +. \} +. el \{\ +. ta (u;\\n[.l]-\\n[.i])R +\t\\*[eqn*num] +. \} +. sp \\n[DD]u +. fi +. ta \\*[eqn*tabs] +. \} +.\} +.. +.\" **************************** +.\" ******** module tbl ******** +.\" **************************** +.\" Tbl support. +.nr tbl*have-header 0 +.de TS +.\" The break is necessary in the case where the first page has not yet begun. +.br +.sp \\n[DD]u +.if '\\$1'H' .di tbl*header-div +.. +.de tbl@top-hook +.if \\n[tbl*have-header] \{\ +. ie \\n[.t]-\\n[tbl*header-ht]-1v .tbl*print-header +. el .sp \\n[.t]u +.\} +.. +.de tbl*print-header +.ev nf +.tbl*header-div +.ev +.mk #T +.. +.de TH +.ie '\\n[.z]'tbl*header-div' \{\ +. nr T. 0 +. T# +. br +. di +. ie \\n[dn]+\\n[FM]+\\n[HM]+2v>=\\n[.p] \{\ +. @error ridiculously long table header +. ds@need \\n[dn] +. tbl*print-header +. \} +. el \{\ +. nr tbl*header-ht \\n[dn] +. ds@need \\n[dn]u+1v +. tbl*print-header +. nr tbl*have-header 1 +. \} +.\} +.el .@error-recover .TH without .TS H +.. +.de @div-end!tbl*header-div +.TH +.TE +.. +.de TE +.ie '\\n(.z'tbl*header-div' .@error-recover .TS H but no .TH before .TE +.el \{\ +. nr tbl*have-header 0 +. sp \\n[DD]u +.\} +.\" reset tabs +.TA +.. +.de tbl@bottom-hook +.if \\n[tbl*have-header] \{\ +. nr T. 1 +. T# +.\} +.. +.de T& +.. 
+.\" **************************** +.\" ******** module pic ******** +.\" **************************** +.\" Pic support. +.\" PS height width +.de PS +.br +.sp \\n[DD]u +.ie \\n[.$]<2 .@error bad arguments to PS (not preprocessed with pic?) +.el \{\ +. ds@need (u;\\$1)+1v +. in +(u;\\n[.l]-\\n[.i]-\\$2/2>?0) +.\} +.. +.de PE +.par@reset +.sp \\n[DD]u+.5m +.. +.\" **************************** +.\" ******** module ref ******** +.\" **************************** +.\" Refer support. +.de ]- +.rm [A [B [C [D [E [G [I [J [N [O [P [Q [R [S [T [V +.rm ref*string +.. +.\" Other +.ds ref*spec!0 Q A T S V N P I C D O +.\" Journal article +.ds ref*spec!1 Q A T J S V N P I C D O +.\" Book +.ds ref*spec!2 Q A T S V P I C D O +.\" Article within book +.ds ref*spec!3 Q A T B E S V P I C D O +.\" Tech report +.ds ref*spec!4 Q A T R G P I C D O +.\" ][ type +.de ][ +.ie d ref*spec!\\$1 .ref*build \\*[ref*spec!\\$1] +.el \{\ +. @error unknown reference type `\\$1' +. ref*build \\*[ref*spec!0] +.\} +.ref*print +.rm ref*string +.rm [F +.. +.\" start of reference number +.ds [. \\*[par@sup-start] +.\" end of reference number +.ds .] \\*[par@sup-end] +.\" period before reference +.ds <. . +.\" period after reference +.ds >. \" empty +.\" comma before reference +.ds <, , +.\" comma after reference +.ds >, \" empty +.\" start collected references +.de ]< +.als ref*print ref*end-print +.SH +\&\\*[REFERENCES] +.par@reset +.. +.\" end collected references +.de ]> +.par@finish +.als ref*print ref*normal-print +.. +.de ref*normal-print +.ie d [F .FS "\\*([.\\*([F\\*(.]" +.el .FS \& +\\*[ref*string] +.FE +.. +.de ref*end-print +.ie d [F .IP "\\*([F." +.el .XP +\\*[ref*string] +.. +.als ref*print ref*normal-print +.de ref*build +.rm ref*string ref*post-punct +.nr ref*suppress-period 1 +.while \\n[.$] \{\ +. if d [\\$1 \{\ +. ie d ref*add-\\$1 .ref*add-\\$1 +. el .ref*add-dflt \\$1 +. \} +. shift +.\} +.\" now add a final period +.ie d ref*string \{\ +. if !\\n[ref*suppress-period] .as ref*string . 
+. if d ref*post-punct \{\ +. as ref*string "\\*[ref*post-punct] +. rm ref*post-punct +. \} +.\} +.el .ds ref*string +.. +.de ref*add-T +.ref*field T , "\\*Q" "" "\\*U" +.if r [T .nr ref*suppress-period \\n([T +.. +.de ref*add-P +.ie \\n([P>0 .ref*field P , "pp. " +.el .ref*field P , "p. " +.. +.de ref*add-J +.ref*field J , \f2 "" \fP +.. +.de ref*add-D +.ref*field D "" ( ) +.. +.de ref*add-E +.ref*field E , "ed. " +.. +.de ref*add-G +.ref*field G "" ( ) +.. +.de ref*add-B +.ref*field B "" "in \f2" "" \fP +.. +.de ref*add-O +.ref*field O . +.ie r [O .nr ref*suppress-period \\n([O +.el .nr ref*suppress-period 1 +.. +.de ref*add-A +.ref*field A , +.if r [A .nr ref*suppress-period \\n([A +.. +.de ref*add-dflt +.ref*field \\$1 , +.. +.\" First argument is the field letter. +.\" Second argument is the punctuation character to use to separate this field +.\" from the previous field. +.\" Third argument is a string with which to prefix this field. +.\" Fourth argument is a string with which to postfix this field. +.\" Fifth argument is a string to add after the punctuation character supplied +.\" by the next field. +.de ref*field +.if d ref*string \{\ +. ie d ref*post-punct \{\ +. as ref*string "\\$2\\*[ref*post-punct] \" +. rm ref*post-punct +. \} +. el .as ref*string "\\$2 \" +.\} +.as ref*string "\\$3\\*([\\$1\\$4 +.if \\n[.$]>4 .ds ref*post-punct "\\$5 +.nr ref*suppress-period 0 +.. +.\" **************************** +.\" ******** module acc ******** +.\" **************************** +.\" Accents and special characters. 
+.ds Q \)``\) +.ds U \)''\) +.ds - \(em +.\" Characters +.if !c\(rg .char \(rg (R) +.if !c\(ah .char \(ah \v'-.55m'\s[\En[.s]/2u]v\s0\v'.55m' +.if !c\(ad .char \(ad \v'-.55m'\s[\En[.s]*7u/10u].\h'.05m'.\s0\v'.55m' +.if !c\(a- .char \(a- \v'-.55m'\D'l .25m 0'\v'.55m' +.if !c\(ao .char \(ao \v'-.55m'\s[\En[.s]*6u/10u]\D'c .25m'\s0\v'.55m' +.if !c\(ac .char \(ac \s[\En[.s]*8u/10u]\v'.05m',\v'-.05m'\s0 +.if !c\(ho .char \(ho \s[\En[.s]/2u]\v'.4m'c\v'-.4m'\s0 +.if !c\(-D .char \(-D \Z'\v'-.1m'-'D +.if !c\(Sd .char \(Sd \Z'\v'-.3m'\h'.2m'-'\(pd +.if !c\(TP .char \(TP I\h'-.25m'\v'-.33m'\s[\En[.s]*6u/10u]\v'.33m'D\ +\v'-.33m'\s0\v'.33m' +.if !c\(Tp .char \(Tp \zlp +.if !c\(ss .char \(ss \(*b +.if !c\(AE .char \(AE A\h'-.3m'E +.if !c\(ae .char \(ae a\h'-.19m'e +.if !c\(OE .char \(OE O\h'-.25m'E +.if !c\(oe .char \(oe o\h'-.14m'e +.if !c\(r? .char \(r? \Z'\h'.1m'\v'-.15m'\s[\En[.s]*7u/10u]i\s0\v'.15m''\ +\v'.15m'\s[\En[.s]*7u/10u]c\s0\v'-.15m' +.if !c\(r! .char \(r! \h'.1m'\Z'\v'-.4m'\s[\En[.s]*8u/10u].\s0\v'.4m''\ +\s[\En[.s]*8u/10u]\v'.4m'\(or\v'-.4m'\s0\h'.1m' +.\" The idea of this definition is for the top of the 3 to be at the x-height. +.\" A yogh really ought to have a little line going north-west from the top +.\" left of the 3. +.if !c\[yogh] .char \[yogh] \Z'\v'\w'x'*0-\En[rst]u'\s[\En[.s]*8u/10u]\ +\v'\w'3'*0+\En[rst]u'3\s0'\h'\w'\s[\En[.s]*8u/10u]3'u' +.\" Accents +.de acc*over-def +.ds \\$1 \Z'\v'(u;\w'x'*0+\En[rst]-\En[.cht])'\ +\h'(u;-\En[skw]+(-\En[.w]-\w'\\$2'/2)+\En[.csk])'\\$2' +.. +.de acc*under-def +.ds \\$1 \Z'\v'\En[.cdp]u'\h'(u;-\En[.w]-\w'\\$2'/2)'\\$2' +.. +.de acc*slash-def +.ds \\$1 \Z'\h'(u;-\En[.w]-\w'\\$2'/2)'\ +\v'(u;\En[.cdp]-\En[.cht]+\En[rst]+\En[rsb]/2)'\\$2' +.. +.de acc*prefix-def +.ds \\$1 \Z'\h'(u;\w'x'-\w'\\$2'/2)'\\$2' +.. 
+.acc*prefix-def ' \' +.acc*prefix-def ` \` +.acc*prefix-def ^ ^ +.acc*prefix-def , \(ac +.acc*prefix-def : \(ad +.acc*prefix-def ~ ~ +.\" improved accent marks +.de AM +.acc*over-def ' \' +.acc*over-def ` \` +.acc*over-def ^ ^ +.acc*over-def ~ ~ +.acc*over-def : \(ad +.acc*over-def v \(ah +.acc*over-def _ \(a- +.acc*over-def o \(ao +.acc*under-def , \(ac +.acc*under-def . \s[\En[.s]*8u/10u]\v'.2m'.\v'-.2m'\s0 +.acc*under-def hook \(ho +.acc*slash-def / / +.char \[hooko] o\\\\*[hook] +.ds q \[hooko] +.ds 3 \[yogh] +.ds D- \(-D\" Icelandic uppercase eth +.ds d- \(Sd\" Icelandic lowercase eth +.ds Th \(TP\" Icelandic uppercase thorn +.ds th \(Tp\" Icelandic lowercase thorn +.ds 8 \(ss\" German double s +.ds Ae \(AE\" AE ligature +.ds ae \(ae\" ae ligature +.ds Oe \(OE\" OE ligature +.ds oe \(oe\" oe ligature +.ds ? \(r?\" upside down ? +.ds ! \(r!\" upside down ! +.. +.\" Make sure that no blank lines creep in at the end of this file. diff --git a/performance/lmbench3/doc/usenix.ol b/performance/lmbench3/doc/usenix.ol new file mode 100644 index 0000000..e3f2796 --- /dev/null +++ b/performance/lmbench3/doc/usenix.ol @@ -0,0 +1,102 @@ +Introduction + What is it? + A bunch of speed of light benchmarks, + not MP, not throughput, not saturation, not stress tests. + A microbenchmark suite + Measures system performance + Latency and bandwidth measurements + Measurements focus on OS and hardware + What is delivered to the application + Not marketing numbers + Benchmark performance predicts application performance + Results for which systems? + Sun, SGI, DEC, IBM, HP, PCs + Useful information to whom? + Performance engineers, system programmers, system architects. +Motivation + What are we measuring? + Control / latency operations + Bandwidth operations + What aren't we measuring? + Basic MIPS & MFLOPS. XXX - not unless I do it right. + What can I learn? + Cost of operations + ****Operations per time unit**** + Compare speed of alternative paths (e.g. mmap vs. 
read) + Performance problems = f(bw issues + latency issues) + Give at least two examples + NFS control & data: UDP lat, proc lat, & various BW metrics + Oracle lock manager: TCP lat + Verilog: mem lat + AIM: fs ops XXX -ask Scott about pipes. + Knowing the speeds of primitives can provide speeds of apps. + An example here would be nice. +Outline + Describe benchmark + Give results from current machines + Discuss results + Future changes, enhancements, etc. +Tutorial on benchmarks + For each metric + what is it? + why is it being measured? + How is it measured? + Measuring subtleties + Interpreting the results +Latency + Process stuff + networking stuff + file system stuff + memory stuff + whatever +Bandwidth + networking + file system + memory +Results + Tabular results - XXX update that table to reflect the newer metrics + Graphs of memory latency & context switches + Discussion + Memory stuff + Maybe contrast AIX with the $100K IBM + uniprocessor w/ killer memory perf and point out + that it is the memory that is making AIX go + fast, it certainly isn't AIX. A more politic + observation would be that systems with good + memory performance tend to have good system + performance; the point being to shift people's + attention to system performance, especially + memory subsystem, as opposed to processor mips. + Comparisons + Maybe look at the table and draw attention to + really good and really bad numbers for various + platforms (like Linux' context switch time, + Linux fs ops, solaris syscall, process stuff, + 990 memory BW). +Graphs + A graph showing a range of really fast to really slow ops, all on the + same graph. Do bandwidth stuff normalized on MB/sec. + Carl sez: show both ops/sec and cost/op on two graphs. + A graph showing processor slow down due to memory misses, assuming + each instruction misses. 
Maybe a graph that shows # of clocks + (or better yet, # of instructions - think super scalar) that you would + have to have between each memory miss in order to run at the clock + speed. +War stories + Sun page coloring bug + SGI page coloring bug + SGI hippi bug - XXX ask Thomas + Sun bcopy bug +Lmbench [optional?] + how to get lmbench + how to compile + how to run + how to show results +Future work + More hardware stuff - better latency measurements (write lat, + cache to cache latency). + add throughput & saturation measurements +TODO + get some similar papers for comparison + Someday I need reasonable I/O benchmarks to show off good + big SMP machines like Challenge. diff --git a/performance/lmbench3/doc/usenix96.ms b/performance/lmbench3/doc/usenix96.ms new file mode 100644 index 0000000..ca46fd4 --- /dev/null +++ b/performance/lmbench3/doc/usenix96.ms @@ -0,0 +1,1798 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de draftPT +.\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP' +.. +.de PT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! 
To turn this off, s/BT/oldBT/ +.de draftBT +.\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' +.. +.de BT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 1995 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM .95i +.nr FM 1i +.nr PO .95i +.if t .po .95i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.75i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Portable tools for performance analysis +.ds author Larry McVoy +.ds lmbench \f(CWlmbench\fP +.ds lmdd \f(CWlmdd\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\"[\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] 
", " +.\" .R2 +.TL +\s(14lmbench: Portable tools for performance analysis\s0\** +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Silicon Graphics, Inc.\s0\fP +.AU +\s+2\fRCarl Staelin\fP +.AI +\s+2\fIHewlett-Packard Laboratories\s0\fP +.SP +.AB +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. In almost all +cases, the individual tests are the result of analysis and isolation +of a customer's actual performance problem. +.\" .SP +These tools can be, and currently are, used to compare different +system implementations from different vendors. +In several cases, +the benchmarks have uncovered previously unknown bugs and design flaws. +The results have shown a strong +correlation between memory system performance and overall performance. +.\" XXX - MP versus uniprocessors? +\*[lmbench] includes an extensible database of +results from systems current as of late 1995. +.AE +.if t .MC 3.05i +.FS +This paper first appeared in the January 1996 Usenix conference proceedings. +The version you are reading has new results as well as some corrections. +.FE +.NH 1 +Introduction +.PP +\*[lmbench] +provides a suite of benchmarks that attempt to measure the most commonly +found performance bottlenecks in a wide range of system applications. +These bottlenecks have been identified, isolated, and reproduced in a set +of small micro-benchmarks, which measure +system latency and bandwidth of data movement among +the processor and memory, network, file system, and disk. +The intent is to produce numbers that real +applications can reproduce, rather than the frequently +quoted and somewhat less reproducible marketing performance numbers. +.PP +The benchmarks focus on latency and bandwidth because +performance issues are usually caused by latency +problems, bandwidth problems, or some combination of the two. 
Each benchmark +exists because it captures some unique performance problem present in +one or more important applications. +For example, the TCP latency benchmark is an accurate predictor of the +Oracle distributed lock manager's performance, the memory latency +benchmark gives a strong indication of Verilog simulation performance, +and the file system latency benchmark models a critical path +in software development. +.PP +\*[lmbench] was developed to identify and evaluate system performance +bottlenecks present in many machines in 1993-1995. It is entirely +possible that computer architectures will have changed and advanced +enough in the next few years to render parts of this benchmark suite +obsolete or irrelevant. +.PP +\*[lmbench] is already in widespread use at many sites by both end +users and system designers. In some cases, \*[lmbench] has provided +the data necessary to discover and correct critical performance +problems that might have gone unnoticed. \*[lmbench] uncovered a +problem in Sun's memory management software +that made all pages map to the same location in the cache, effectively +turning a 512 kilobyte (K) cache into a 4K cache. +.PP +\*[lmbench] measures only a system's ability +to transfer data between processor, cache, memory, network, and disk. +It does not measure other parts of the system, such as the graphics subsystem, +nor is it a MIPS, MFLOPS, +throughput, saturation, stress, graphics, or multiprocessor test suite. +It is frequently run on multiprocessor (MP) systems to compare their performance +against +uniprocessor systems, but it does not take advantage of any multiprocessor +features. +.PP +The benchmarks are written using standard, portable +system interfaces and facilities commonly +used by applications, so +\*[lmbench] +is portable and comparable over a wide set of Unix systems. +\*[lmbench] has been run on +AIX, +BSDI, +HP-UX, +IRIX, +Linux, +FreeBSD, +NetBSD, +OSF/1, +Solaris, +and +SunOS. 
+Part of the suite has been run on Windows/NT as well. +.PP +\*[lmbench] +is freely distributed under +the Free Software Foundation's General Public License +.RN Stallman89 , +with the additional restriction +that results may be reported only if the benchmarks are unmodified. +.NH 1 +Prior work +.PP +Benchmarking and performance analysis is not a new endeavor. +There are too many other benchmark suites to list all of +them here. We compare \*[lmbench] +to a set of similar benchmarks. +.BU "I/O (disk) benchmarks" : +IOstone +.RN Park90 +wants to be an I/O benchmark, but actually measures the memory +subsystem; all of the tests fit easily in the cache. +IObench +.RN Wolman89 +is a systematic file system and disk benchmark, but it is +complicated and unwieldy. +In +.RN McVoy91 +we reviewed many I/O benchmarks and found them all +lacking because they took too long to run and +were too complex a solution to a fairly simple problem. We wrote a +small, simple I/O benchmark, \*[lmdd] that +measures sequential and random I/O far +faster than either IOstone or IObench. As part of +.RN McVoy91 +the results from \*[lmdd] were checked against IObench (as well as some other +Sun internal I/O benchmarks). \*[lmdd] proved to be more accurate than any +of the other benchmarks. +At least one disk vendor +routinely uses \*[lmdd] to do performance testing of its disk drives. +.SP +Chen and Patterson +.RN "Chen93, Chen94" +measure I/O performance under a +variety of workloads that are automatically varied to test the +range of the system's performance. +Our efforts differ in that we are more interested in the CPU overhead +of a single request, rather than the capacity of the system as a whole. +.BU "Berkeley Software Distribution's microbench suite" : +The BSD effort generated an extensive set of +test benchmarks to do regression testing (both quality and performance) +of the BSD releases. 
+We did not use this as a basis for our work (although we used ideas) +for the following reasons: +(a) missing tests \(em such as memory latency, +(b) too many tests, the results tended to be obscured under a mountain +of numbers, +and (c) wrong copyright \(em we wanted the +Free Software Foundation's General Public License. +.BU "Ousterhout's Operating System benchmark" : +.RN Ousterhout90 +proposes several system benchmarks to measure system call +latency, context switch time, and file system performance. +We used the same ideas as a basis for our work, while trying to +go farther. We measured a more complete set of +primitives, including some hardware measurements; went into greater depth +on some of the tests, such as context switching; and went to great +lengths to make the benchmark portable and extensible. +.BU "Networking benchmarks" : +\f(CWNetperf\fP measures networking bandwidth and latency and +was written by Rick Jones of Hewlett-Packard. +\*[lmbench] includes a smaller, +less complex benchmark that produces similar results. +.SP +\f(CWttcp\fP is a widely used benchmark in the Internet community. +Our version of the same benchmark +routinely delivers bandwidth numbers that are within 2% of the numbers +quoted by \f(CWttcp\fP. +.BU "McCalpin's stream benchmark" : +.RN McCalpin95 +has memory bandwidth measurements and results for a large number of +high-end systems. +We did not use these because we discovered them only after +we had results using our versions. +We will probably include McCalpin's benchmarks in \*[lmbench] +in the future. +.PP +In summary, we rolled our own because we wanted simple, portable +benchmarks that accurately measured a wide variety of operations that we +consider crucial to performance on today's systems. While portions of +other benchmark suites include similar work, none includes all of it, +few are as portable, and almost all are far more complex. Less filling, +tastes great. 
+.NH 1 +Benchmarking notes +.NH 2 +Sizing the benchmarks +.PP +The proper sizing of various benchmark parameters is crucial to ensure +that the benchmark is measuring the right component of system performance. +For example, memory-to-memory copy +speeds are dramatically affected by the location of the data: if +the size parameter is too small so +the data is in a cache, then the performance may be as much as ten times +faster than if the data is in memory. +On the other hand, if the memory size parameter is too big so the data +is paged to disk, then performance may be slowed to such an extent +that the benchmark seems to `never finish.' +.PP +\*[lmbench] takes the following approach to the cache and memory +size issues: +.BU +All of the benchmarks that could be affected +by cache size are run in a loop, +with increasing sizes (typically powers of two) until some maximum size +is reached. The results may then be plotted to see where the benchmark +no longer fits in the cache. +.BU +The benchmark verifies that there is sufficient memory to run all of the +benchmarks in main memory. A small test program allocates as much memory +as it can, clears the memory, +and then strides through that memory a page at a time, timing +each reference. If any reference takes more than a few microseconds, the +page is no longer in memory. The test program starts small and works forward +until either enough memory is seen as present or the memory limit is reached. +.NH 2 +Compile time issues +.PP +The GNU C compiler, \*[gcc], is the compiler we chose because +it gave the most reproducible results across platforms. +When \*[gcc] was not present, we used the vendor-supplied \f(CWcc\fP. +All of the benchmarks were compiled with optimization \f(CW-O\fP +except +the benchmarks that calculate clock speed and the context switch times, +which must be compiled without optimization in order to produce +correct results. 
No other optimization flags were enabled because +we wanted results that would be commonly seen by application writers. +.PP +All of the benchmarks were linked using the default manner of +the target system. For most if not all systems, the +binaries were linked using shared libraries. +.NH 2 +Multiprocessor issues +.PP +All of the multiprocessor systems ran the benchmarks in the same way as +the uniprocessor systems. Some systems allow users to pin processes +to a particular CPU, which sometimes results in better cache reuse. We +do not pin processes because it defeats the MP scheduler. +.\" XXX - I should do this on an IP19 and mark it as pinned. +In certain cases, this decision yields interesting results discussed later. +.NH 2 +Timing issues +.LP +.sp -.5 +.BU "Clock resolution" : +The benchmarks measure the elapsed time by reading the system clock via the +\*[gettimeofday] interface. On some systems this interface has a resolution +of 10 milliseconds, a long time relative to many of the benchmarks which +have results measured in tens to hundreds of microseconds. To compensate for +the coarse clock resolution, the benchmarks are hand-tuned to measure +many operations within a single time interval lasting for many clock ticks. +Typically, this is done by executing the operation in a small loop, sometimes +unrolled if the operation is exceedingly fast, and then dividing +the loop time by the loop count. +.BU Caching : +If the benchmark expects the data to be in the cache, the benchmark is +typically run several times; only the last result is recorded. +.SP +If the benchmark does not want to measure cache performance it sets +the size parameter larger than the cache. For example, the +\*[bcopy] benchmark by default copies 8 megabytes to 8 megabytes, +which largely defeats any second-level cache in use today. (Note that the +benchmarks are not trying to defeat the file or process page cache, +only the hardware caches.) 
+.br +.di bigtable +.ev keep +.ps 8 +.vs 9 +.so systems.tbl +.ps \n[PS] +.vs \n[VS] +.nr TABLE \n[TABLE]+1 +.ce 1 +.SP +\fBTable \n[TABLE].\ \ System descriptions.\fP +.SP +.di +.ev +.nr WHEN \n[dn]+\n[FM] +.nr THT \n[dn] +.de print*table +' sp .5 +' ev keep +' nf +' bigtable +. ne 1 +. wh -\n[WHEN]u skip*page +. fi +. ev +.. +.de skip*page +' sp \n[THT]u +. wh -\n[WHEN]u +.. +.wh -\n[WHEN]u print*table +.BU Variability : +The results of some benchmarks, most notably the context switch benchmark, had a tendency +to vary quite a bit, up to 30%. We suspect that the +operating system is not using the same set of physical +pages each time a process is created and we are seeing the effects of +collisions in the external caches. We compensate by running the +benchmark in a loop and taking the minimum result. Users interested in +the most accurate data are advised to verify the results on their +own platforms. +.PP +Many of the results included in the database were donated by users +and were not created by the authors. +Good benchmarking practice suggests that one should run the benchmarks +as the only user of a machine, without other resource intensive +or unpredictable processes or daemons. +.NH 2 +Using the \f(CBlmbench\fP database +.PP +\*[lmbench] includes a database of results that +is useful for comparison purposes. It is quite easy to +build the source, run the benchmark, and produce a table of results +that includes the run. All of the tables in this paper were produced +from the database included in \*[lmbench]. This paper is also +included with \*[lmbench] and may be reproduced incorporating new results. +For more information, consult the file \f(CWlmbench-HOWTO\fP in the +\*[lmbench] distribution. +.NH 1 +Systems tested +.PP +\*[lmbench] has been run on a wide variety of platforms. This +paper includes results from a representative subset of machines and +operating systems. 
+Comparisons between similar hardware running different operating +systems can be very illuminating, and we have included a few examples +in our results. +.PP +The systems are briefly characterized in Table 1. Please note that the list prices +are very approximate as is the year of introduction. +The SPECInt92 numbers are a little suspect since +some vendors have been ``optimizing'' for certain parts of SPEC. We try and +quote the original SPECInt92 numbers where we can. +.NH 2 +Reading the result tables +.PP +Throughout the rest of this paper, we present tables of results for many of the +benchmarks. All of the tables are sorted, from best to worst. Some tables +have multiple columns of results and those tables are sorted on only one of +the columns. The sorted column's heading will be in \fBbold\fP. +.NH 1 +Bandwidth benchmarks +.PP +By bandwidth, we mean the rate at which a particular facility can move +data. +We attempt to measure the data movement ability of a number of +different facilities: +library \*[bcopy], +hand-unrolled \*[bcopy], +direct-memory read and write (no copying), +pipes, +TCP sockets, +the \*[read] interface, +and +the \*[mmap] interface. +.NH 2 +Memory bandwidth +.PP +Data movement is fundamental to any operating system. +In the past, performance +was frequently measured in MFLOPS because floating point units were +slow enough that microprocessor systems were +rarely limited by memory bandwidth. Today, floating point units are usually much +faster than memory bandwidth, so many current MFLOP ratings can not be +maintained using memory-resident data; they are ``cache only'' ratings. +.PP +We measure the ability to +copy, read, and write data over a varying set of sizes. +There are too many results to report all of them here, so we concentrate on +large memory transfers. +.PP +We measure copy bandwidth two ways. The first is the user-level library +\*[bcopy] interface. 
+The second is a hand-unrolled loop that loads and stores +aligned 8-byte words. +In both cases, we took care to +ensure that the source and destination locations would not map to the same +lines if the any of the caches were direct-mapped. +In order to test memory bandwidth rather than cache bandwidth, +both benchmarks copy an 8M\** area to another 8M area. +(As secondary caches reach 16M, these benchmarks will have to +be resized to reduce caching effects.) +.FS +Some of the PCs had less than 16M of available memory; +those machines copied 4M. +.FE +.PP +The copy results actually represent one-half to one-third of the memory +bandwidth used to obtain those results since we are reading and writing +memory. If the cache line size is larger than the word stored, then +the written cache line will typically be read before it is written. The +actual amount of memory bandwidth used varies because some architectures +have special instructions specifically designed for the \*[bcopy] +function. Those architectures will move twice as much memory as +reported by this benchmark; less advanced architectures move three +times as much memory: the memory read, the memory read because it is +about to be overwritten, and the memory written. +.PP +The \*[bcopy] results reported in Table 2 +may be correlated with John McCalpin's \*[stream] +.RN McCalpin95 +benchmark results in the following manner: +the \*[stream] benchmark reports all of the memory moved +whereas the \*[bcopy] benchmark reports the bytes copied. So our +numbers should be approximately one-half to one-third of his numbers. +.PP +Memory reading is measured by an unrolled loop that sums up a series of +integers. On most (perhaps all) systems measured the integer +size is 4 bytes. The loop is unrolled such that most compilers generate +code that uses a constant offset with the load, resulting in a load and +an add for each word of memory. The add is an integer add that completes +in one cycle on all of the processors. 
Given that today's processor
+typically cycles at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000
+ns per cache line, the results reported here should be dominated by the
+memory subsystem, not the processor add unit.
+.PP
+The memory contents are added up because almost all C compilers
+would optimize out the whole loop when optimization was turned on, and
+would generate far too many instructions without optimization.
+The solution is to
+add up the data and pass the result as an unused argument to the
+``finish timing'' function.
+.PP
+Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect
+that pure reads should run at roughly twice the speed of \*[bcopy].
+Exceptions to this rule should be studied, for exceptions indicate a bug
+in the benchmarks, a problem in \*[bcopy], or some unusual hardware.
+.TSTART
+.so ../Results/tmp/bw_allmem.tbl
+.TEND "Memory bandwidth (MB/s)"
+.PP
+Memory writing is measured by an unrolled loop that stores a value into
+an integer (typically a 4 byte integer) and then increments the pointer.
+The processor cost of each memory operation is approximately the same
+as the cost in the read case.
+.PP
+The numbers reported in Table \n[TABLE]
+are not the raw hardware speed in some cases.
+The Power2\** is capable of up to 800M/sec read rates
+.FS
+Someone described this machine as a $1,000 processor on a $99,000 memory
+subsystem.
+.FE
+.RN McCalpin95
+and HP PA RISC (and other prefetching)
+systems also do better if higher levels of code optimization are used
+and/or the code is hand tuned.
+.PP
+The Sun libc bcopy in Table \n[TABLE]
+is better because they use a hardware specific bcopy
+routine that uses instructions new in SPARC V9 that were added specifically
+for memory movement. 
+.PP +The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because, +according to Intel, the write transaction turns into a read followed by +a write to maintain cache consistency for MP systems. +.NH 2 +IPC bandwidth +.PP +Interprocess communication bandwidth is frequently a performance issue. +Many Unix applications are composed of several processes communicating +through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation +system that prepared this paper, the \f(CWX Window System\fP, remote file access, +and \f(CWWorld Wide Web\fP servers. +.PP +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +TCP sockets are similar +to pipes except they are bidirectional and can cross machine +boundaries. +.PP +Pipe bandwidth is measured by creating two processes, a writer and a +reader, which transfer 50M of data in 64K transfers. +The transfer size was chosen so that the overhead of system calls +and context switching would not dominate the benchmark time. +The reader prints the timing results, which guarantees that all +data has been moved before the timing is finished. +.PP +TCP bandwidth is measured similarly, except the data is transferred in +1M page aligned transfers instead of 64K transfers. If the TCP +implementation supports it, the send and receive socket buffers are +enlarged to 1M, instead of the default 4-60K. We have found that +setting the transfer size equal to the socket buffer size produces the +greatest throughput over the most implementations. +.TSTART +.so ../Results/tmp/bw_ipc.tbl +.TEND "Pipe and local TCP bandwidth (MB/s)" +.PP +\*[bcopy] is important to this test because the +pipe write/read is typically implemented as a \*[bcopy] into the kernel +from the writer and then a \*[bcopy] from the kernel to the reader. 
+Ideally, these results would be approximately one-half of the +\*[bcopy] results. It is possible for the kernel \*[bcopy] +to be faster than the C library \*[bcopy] since the kernel may have +access to \*[bcopy] hardware unavailable to the C library. +.PP +It is interesting to compare pipes with TCP because the TCP benchmark is +identical to the pipe benchmark except for the transport mechanism. +Ideally, the TCP bandwidth would be as good as the pipe +bandwidth. It is not widely known that the +majority of the TCP cost is in the \*[bcopy], the checksum, +and the network interface driver. +The checksum and the driver may be safely eliminated in the loopback +case and if the costs have been eliminated, then TCP should be just as +fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to +see that Solaris and HP-UX have done this optimization. +.PP +Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the +pipe transfers are done in 64K buffers, a size that frequently fits in +caches, while the bcopy is typically an 8M-to-8M copy, which does not +fit in the cache. +.PP +In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than +the SGI MP on pipe bandwidth because of caching effects - in the UP +case, both processes share the cache; on the MP, each process is +communicating with a different cache. +.PP +All of the TCP results in Table \n[TABLE] are in loopback mode \(em that +is both ends of the socket are on the same machine. It was impossible +to get remote networking results for all the machines included in this +paper. We are interested in receiving more results for identical +machines with a dedicated network connecting them. The results we have +for over the wire TCP bandwidth are shown below. +.TSTART +.so tcp_bw.tbl +.TEND "Remote TCP bandwidth (MB/s)" +.PP +The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. 
+The SGI Hippi interface has hardware support for TCP checksums and +the IRIX operating system uses virtual memory tricks to avoid copying +data as much as possible. +For larger transfers, SGI Hippi has reached 92MB/s over TCP. +.PP +100baseT is looking quite competitive when compared to FDDI in Table +\n[TABLE], even though FDDI has packets that are almost three times +larger. We wonder how long it will be before we see gigabit ethernet +interfaces. +.NH 2 +Cached I/O bandwidth +.PP +Experience has shown us that reusing data in the file system +page cache can be a performance issue. This +section measures that operation through two interfaces, \*[read] and +\*[mmap]. +The benchmark here is not an I/O benchmark in that no disk activity is +involved. +We wanted to measure the overhead +of reusing data, an overhead that is CPU intensive, rather than disk intensive. +.PP +The \*[read] interface copies data from the kernel's file system page cache into the +process's buffer, using 64K buffers. The transfer size was chosen +to minimize the kernel entry overhead while +remaining realistically sized. +.PP +The difference between the \*[bcopy] and the \*[read] benchmarks +is the cost of the file and virtual memory system overhead. In most +systems, the \*[bcopy] speed should be faster than the \*[read] speed. The +exceptions usually have hardware specifically designed +for the \*[bcopy] function and that hardware may be available only to +the operating system. +.PP +The \*[read] benchmark is implemented by rereading a file +(typically 8M) in 64K +buffers. Each buffer is summed as a series of integers in the user +process. The summing is done for two reasons: for an apples-to-apples +comparison the memory-mapped benchmark needs to touch all the data, +and the file system can sometimes transfer data into memory faster than the +processor can read the data. 
+For example, \s-1SGI\s0's XFS can move data into memory at +rates in excess of 500M per second, but it can move data into +the cache at only 68M per second. The intent is to measure performance +delivered to the application, not DMA performance to memory. +.TSTART +.so ../Results/tmp/bw_reread2.tbl +.TEND "File vs. memory bandwidth (MB/s)" +.PP +The \*[mmap] interface provides a way to access the kernel's file cache +without copying the data. +The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) +into the +process's address space. The file is then summed to force the data +into the cache. +.PP +In Table \n[TABLE], +a good system will have \fIFile read\fP as fast as (or even faster than) +\fILibc bcopy\fP because as the file system overhead goes to zero, the +file reread case is virtually the same as the library \*[bcopy] case. +However, file reread can be faster because the kernel may have access to +\*[bcopy] assist hardware not available to the C library. +Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP +performance, but \*[mmap] is often dramatically worse. +Judging by the results, this looks to be a +potential area for operating system improvements. +.PP +In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes +full advantage of the memory subsystem from inside the kernel. +The mmap reread is probably slower because of the lower clock rate; +the page faults start to show up as a significant cost. +.PP +It is surprising that the Sun Ultra1 was able to bcopy at the high +rates shown in Table 2 but did not show those rates for file reread +in Table \n[TABLE]. +HP has the opposite problem, they get file reread faster than bcopy, +perhaps because the kernel \*[bcopy] has access to hardware support. +.PP +The Unixware system has outstanding mmap reread rates, better than +systems of substantially higher cost. Linux needs to do some work on +the \f(CWmmap\fP code. 
+.NH 1 +Latency measurements +.PP +Latency is an often-overlooked +area of performance problems, possibly because resolving latency issues +is frequently much harder than resolving bandwidth issues. For example, +memory bandwidth may be increased by making wider cache lines and increasing +memory ``width'' and interleave, +but memory latency can be improved only by shortening paths or increasing +(successful) prefetching. +The first step toward improving latency is understanding the +current latencies in a system. +.PP +The latency measurements included in this suite are +memory latency, +basic operating system entry cost, +signal handling cost, +process creation times, +context switching, +interprocess communication, +.\" virtual memory system latency, +file system latency, +and disk latency. +.NH 2 +Memory read latency background +.PP +In this section, we expend considerable effort to define the different memory +latencies and to explain and justify our benchmark. +The background is a bit tedious but important, since we believe the +memory +latency measurements to be one of the most thought-provoking and useful +measurements in \*[lmbench]. +.PP +The most basic latency measurement is memory latency since most of +the other latency measurements can be expressed in terms of memory +latency. For example, context switches require saving the current +process state and loading the state of the next process. However, memory +latency is rarely accurately measured and frequently misunderstood. +.PP +Memory read latency has many definitions; +the most common, +in increasing time order, +are memory chip cycle time, processor-pins-to-memory-and-back time, +load-in-a-vacuum time, and back-to-back-load time. +.BU "Memory chip cycle latency" : +Memory chips are rated in nanoseconds; typical speeds are around 60ns. +A general overview on DRAM architecture may be found in +.RN Hennessy96 . 
+The +specific information we describe here is from +.RN Toshiba94 +and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0 +\s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the +time from +.ps -1 +.nr width \w'R\&A\&S' +.nr height \n[rst]+1000 +RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +assertion to the when +the data will be available on the \s-1DRAM\s0 pins (assuming +.ps -1 +.nr width \w'C\&A\&S' +.nr height \n[rst]+1000 +CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +access time requirements were met). +While it is possible +to get data out of a \s-1DRAM\s0 in 60ns, that is not all of +the time involved. There is a precharge time that must occur after +every access. +.RN Toshiba94 +quotes 110ns as the random read or write cycle time and this +time is more representative of the cycle time. +.\" For example, most systems offer a wide range of memory +.\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number +.\" of simms range from 1 to 16. The more simms there are, the more +.\" capacitance there is in the memory subsystem. More capacitance means +.\" longer setup times for the fully populated memory subsystem. System +.\" designers have to allow time for this setup. +.\" For more details, consult [XXX - reference on DRAM]. +.\" This is sometimes referred to as the chip latency. The +.\" chip cycle time is the chip latency plus the time required to restore +.\" the data in the capacitors which is often referred to as the precharge +.\" time. This means that 60 nanosecond memory chips really are more like +.\" 100 nanosecond memory chips. Some systems operate memory in ``page +.\" mode'' or ``static column'' memory systems hold either RAS or CAS and +.\" allow subsequent accesses in the same row or column in one cycle instead +.\" of two. 
+.BU "Pin-to-pin latency" :
+This number represents the time needed
+for the memory request to travel from the processor's pins to the memory
+subsystem and back again. Many vendors have used the pin-to-pin
+definition of memory latency in their reports. For example,
+.RN Fenwick95
+while describing the \s-1DEC\s0 8400
+quotes memory latencies of 265ns; a careful
+reading of that paper shows that these are pin-to-pin numbers. In spite
+of the historical precedent in vendor reports, this definition of memory
+latency is misleading since it ignores actual delays seen when a load
+instruction is immediately followed by a use of the data being loaded.
+The number of additional cycles inside the processor can be significant
+and grows more significant with today's highly pipelined architectures.
+.PP
+It is worth noting that the pin-to-pin numbers
+include the amount of time it takes to charge
+the lines going to the \s-1SIMM\s0s, a time that increases with the
+(potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
+more capacitance which results in longer charge times. This is one reason
+why personal computers frequently have better memory latencies than
+workstations: the PCs typically have less memory capacity.
+.BU "Load-in-a-vacuum latency" :
+A load in a vacuum is the time that the processor will wait for one load that
+must be fetched from main memory (i.e., a cache miss). The ``vacuum''
+means that there is no other activity on the system bus, including no other
+loads.
+While this number is frequently used as the memory latency, it is not very
+useful. It is basically a ``not to exceed'' number important only for
+marketing reasons.
+Some architects point out that since most processors implement nonblocking
+loads (the load does not cause a stall until the data is used), the perceived
+load latency may be much less than the real latency. 
When pressed, however, +most will admit that cache misses occur in bursts, resulting in perceived +latencies of at least the load-in-a-vacuum latency. +.BU "Back-to-back-load latency" : +Back-to-back-load latency is the time that each load takes, assuming +that the instructions before and after are also cache-missing loads. +Back-to-back loads may take longer than loads in a vacuum for the +following reason: many systems implement something known as \fIcritical +word first\fP, which means that the subblock of the cache line that +contains the word being loaded is delivered to the processor before the +entire cache line has been brought into the cache. If another load +occurs quickly enough after the processor gets restarted from the +current load, the second load may stall because the cache is still +busy filling the cache line for the previous load. On some systems, +such as the current implementation of UltraSPARC, +the difference between back to back and load in a vacuum is about 35%. +.PP +\*[lmbench] measures back-to-back-load latency because it is the +only measurement that may be easily measured from software and +because we feel that it is what most software developers consider to be memory +latency. Consider the following C code fragment: +.DS +.nf +.ft CW +p = head; +while (p->p_next) + p = p->p_next; +.ft +.fi +.DE +On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the +load. A 300 Mhz processor has a 3.33ns cycle time, so the loop +could execute in slightly less than 10ns. However, the load itself +takes 400ns on a 300 Mhz \s-1DEC\s0 8400. In other words, the +instructions cost 10ns but the load stalls for 400. Another +way to look at it is that 400/3.3, or 121, nondependent, +nonloading instructions following the load would be needed +to hide the load latency. 
+Because superscalar processors typically execute multiple operations +per clock cycle, they need even more useful operations between cache +misses to keep the processor from stalling. +.PP +This benchmark illuminates the tradeoffs in processor cache design. +Architects like large cache lines, up to 64 bytes or so, because +the prefetch effect of gathering a whole line increases +hit rate given reasonable spatial locality. +Small stride sizes have high spatial locality and should have higher +performance, but large stride sizes have poor spatial locality causing +the system to prefetch useless data. +So the benchmark provides the following insight into negative +effects of large line prefetch: +.BU +Multi-cycle fill operations are typically atomic events at the +caches, and sometimes block other cache accesses until they +complete. +.BU +Caches are typically single-ported. Having a large line prefetch +of unused data causes extra bandwidth +demands at the cache, and can cause increased access latency for +normal cache accesses. +.PP +In summary, we believe that processors are so fast that the average +load latency for cache misses will be closer to the +back-to-back-load number than to the load-in-a-vacuum number. We are +hopeful that the industry will standardize on this definition of +memory latency. +.NH 2 +Memory read latency +.PP +The entire memory hierarchy can be measured, including on-board data +cache latency and size, external data cache latency and size, and +main memory latency. +Instruction caches are not measured. +TLB miss latency can also be measured, as in +.RN Saavedra92 , +but we stopped at main memory. Measuring TLB miss time is problematic +because different systems map different amounts of memory with their +TLB hardware. +.PP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. 
Then the list is walked thus: +.DS +.ft CW +mov r4,(r4) # C code: p = *p; +.ft +.DE +The time to do about 1,000,000 loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction in one processor cycle +(not counting stalls). In other words, if the processor cache load time +is 60ns on a 20ns processor, the load latency reported +would be 40ns, the additional 20ns is for the load instruction +itself.\** +.FS +In retrospect, this was a bad idea because we calculate the clock +rate to get the instruction execution time. If the clock rate is off, +so is the load time. +.FE +Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (we don't know of any processors that do that). +.PP +This benchmark has been validated by logic analyzer measurements +on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer +Research Center. +.TSTART 1 +.so mem.pic +.FEND "Memory latency" 1 +.PP +Results from the memory latency benchmark are plotted as a series of data sets +as shown in Figure \n[FIGURE]. +Each data set represents a stride size, +with the array size varying from 512 bytes up to 8M or more. +The curves contain a series of +horizontal plateaus, where each plateau represents a level in the +memory hierarchy. +The point where each plateau ends and the line rises marks the +end of that portion of the memory hierarchy (e.g., external cache). +Most machines have similar memory hierarchies: +on-board cache, external cache, main memory, and main memory plus TLB +miss costs. 
+There are variations: some processors are missing a cache, while +others add another cache to the hierarchy. +.\" XXX Larry please double-check this; I am going on dim memory... +For example, the Alpha 8400 has two on-board caches, one 8K +and the other 96K. +.PP +The cache line size can be derived by comparing curves and noticing which +strides are faster than main memory times. The smallest stride that is +the same as main memory speed is likely to be the cache line size because +the strides that are faster than memory are +getting more than one hit per cache line. +.\" Prefetching may confuse +.\" the issue because a demand read may stall behind a prefetch load, +.\" causing cache lines to appear twice as large as they are. +.\" XXX +.\" Larry --- can we use prime modulus arithmetic to set up pointer +.\" loops which might appear random but which really aren't and which +.\" hit every stride once before looping? +.\" +.\" XXX +.\" Larry --- is there any way we can defeat/disable prefetching +.\" so the cache line size can be more accurately determined? +.\" +.\" XXX +.\" Larry --- can we create a benchmark for TLB misses? +.\" I think it was Tom Rokicki who suggested that we create a +.\" benchmark where the data fits in the cache, but the pages don't +.\" fit in the TLB. +.\" +.\" XXX +.\" Larry --- is the description of the memory hierarchy correct? +.\" I am not sure I haven't added an extra level of external cache... +.EQ +delim $$ +.EN +.PP +Figure \n[FIGURE] shows memory latencies on a nicely made machine, +a \s-1DEC\s0 Alpha. +We use this machine as the example +because it shows the latencies and sizes of +the on-chip level 1 and motherboard level 2 caches, and because it +has good all-around numbers, especially considering it can support a +4M level 2 cache. +The on-board cache is $2 sup 13$ bytes or 8K, while the +external cache is $2 sup 19$ bytes or 512K. 
+.EQ
+delim off
+.EN
+.TSTART
+.so lat_allmem.tbl
+.TEND "Cache and memory latency (ns)"
+.nr MEMTABLE \n[TABLE]
+.PP
+Table \n[TABLE] shows the cache size, cache latency, and main memory
+latency as extracted from the memory latency graphs.
+The graphs and the tools for extracting the data are
+included with \*[lmbench].
+It is worthwhile to plot all of the graphs and examine them since the
+table is missing some details, such as the
+\s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache.
+.PP
+We sorted Table \n[TABLE] on level 2 cache latency because we think
+that many applications will fit in the level 2 cache. The HP and IBM
+systems have only one level of cache so we count that as both level 1
+and level 2. Those two systems have remarkable cache performance for
+caches of that size. In both cases, the cache delivers data in one
+clock cycle after the load instruction.
+.PP
+HP systems usually focus on
+large caches as close as possible to the processor. An older HP
+multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
+cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
+primarily a database server.
+.FS
+The Usenix version of this paper had this as a set associative cache; that was
+incorrect.
+.FE
+.PP
+The IBM focus is on low latency, high
+bandwidth memory. The IBM memory subsystem is good because all of
+memory is close to the processor, but has the weakness that it is
+extremely difficult to evolve the design to a multiprocessor system.
+.PP
+The 586 and PowerPC motherboards have quite poor second level caches,
+the caches are not substantially better than main memory.
+.PP
+The Pentium Pro and Sun Ultra second level caches are of medium speed
+at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared
+against the HP and IBM one cycle latency caches of similar size.
+Given the tight integration of the Pentium Pro level 2 cache, it is
+surprising that it has such high latencies. 
+.PP +The 300Mhz DEC Alpha has a rather high 22 clock latency to the second +level cache which is probably one of the reasons that they needed a 96K +level 1.5 cache. SGI and DEC have used large second level caches +to hide their long latency from main memory. +.PP +.NH 2 +Operating system entry +.PP +Entry into the operating system is required for many system facilities. +When calculating the cost of a facility, it is useful to know how +expensive it is to perform a nontrivial entry into the operating system. +.PP +We measure nontrivial entry into the system by repeatedly writing one +word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but +discard the data. This particular entry point was chosen because it has +never been optimized in any system that we have measured. Other entry +points, typically \*[getpid] and \*[gettimeofday], are heavily used, +heavily optimized, and sometimes implemented as user-level library +routines rather than system calls. +A write to the \f(CW/dev/null\fP driver will go +through the system call table to \*[write], verify the user area as +readable, look up the file descriptor to get the vnode, call the vnode's +write function, and then return. +.TSTART +.so ../Results/tmp/lat_nullsys.tbl +.TEND "Simple system call time (microseconds)" +.PP +Linux is the clear winner in the system call time. The reasons are +twofold: Linux is a uniprocessor operating system, without any +MP overhead, and Linux is a small operating system, without all +of the ``features'' accumulated by the commercial offers. +.PP +Unixware and Solaris are doing quite well, given that they are both fairly +large, commercially oriented operating systems with a large accumulation +of ``features.'' +.NH 2 +Signal handling cost +.PP +Signals in Unix are a way to tell another process to handle an event. They +are to processes as interrupts are to the CPU. +.PP +Signal handling is often critical to layered systems. 
Some applications,
+such as databases, software development environments, and threading libraries
+provide an operating system-like layer on top of the operating system,
+making signal handling a critical path in many of these applications.
+.PP
+\*[lmbench] measures both signal installation and signal dispatching in two separate
+loops, within the context of one process.
+It measures signal handling by installing a signal handler and then repeatedly
+sending itself the signal.
+.TSTART
+.so ../Results/tmp/lat_signal.tbl
+.TEND "Signal times (microseconds)"
+.PP
+Table \n[TABLE] shows the signal handling costs.
+Note that there are no context switches in this benchmark; the signal goes
+to the same process that generated the signal. In real applications,
+the signals usually go to another process, which implies
+that the true cost of sending that signal is the signal overhead plus the
+context switch overhead. We wanted to measure signal and context
+switch overheads separately since context
+switch times vary widely among operating systems.
+.PP
+SGI does very well on signal processing,
+especially since their hardware is of an older generation than
+many of the others.
+.PP
+The Linux/Alpha signal handling numbers are so poor
+that we suspect that this is a bug, especially given that the Linux/x86
+numbers are quite reasonable.
+.NH 2
+Process creation costs
+.PP
+Process benchmarks are used to measure the basic process primitives,
+such as creating a new process, running a different program, and context
+switching. Process creation benchmarks are of particular interest
+in distributed systems since many remote operations include the creation
+of a remote process to shepherd the remote operation to completion.
+Context switching is important for the same reasons.
+.BU "Simple process creation" .
+The Unix process creation primitive is \*[fork], which
+creates a (virtually) exact copy of the calling process. 
+Unlike VMS and some other operating systems, Unix starts any new process
+with a \*[fork].
+Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and
+``light,'' facts that many have been ignoring for some time.
+.PP
+\*[lmbench] measures simple process creation by creating a process
+and immediately
+exiting the child process. The parent process waits for the child
+process to exit.
+The benchmark is intended to measure the overhead for creating a
+new thread of control, so it includes the \*[fork] and
+the \*[exit] time.
+.PP
+The benchmark also includes a \f(CWwait\fP system call in the parent and
+context switches from the parent to the child and back again. Given that
+context switches of this sort are on the order of 20 microseconds and a
+system call is on the order of 5 microseconds, and that the entire benchmark
+time is on the order of a millisecond or more, the extra overhead
+is insignificant.
+Note that even this relatively simple task is very expensive and is
+measured in milliseconds while most of the other operations we consider are
+measured in microseconds.
+.BU "New process creation" .
+The preceding benchmark did not create a new application; it created a
+copy of the old application. This benchmark measures the cost of creating a
+new process and changing that process into a new application, which
+forms the basis of every Unix command
+line interface, or shell.
+\*[lmbench] measures this facility by forking a new child and having that child
+execute a new program \(em in this case, a tiny program that prints
+``hello world'' and exits.
+.PP
+The startup cost is especially noticeable
+on (some) systems that have shared libraries. Shared libraries can
+introduce a substantial (tens of milliseconds) startup cost.
+.\" XXX - statically linked example?
+.TSTART
+.so ../Results/tmp/lat_allproc.tbl
+.TEND "Process creation time (milliseconds)"
+.BU "Complicated new process creation" . 
+When programs start other programs, they frequently use one of +three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first +two interfaces start a new process by invoking the standard command +interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way +guarantees that the shell will look for the requested application +in all of the places that the user would look \(em in other words, the shell +uses the user's $PATH variable as a list of places to find the +application. \*[execlp] is a C library routine which also looks for the +program using the user's $PATH variable. +.PP +Since this is a common way of starting applications, we felt it +was useful to show the costs of the generality. +.PP +We measure this by starting \f(CW/bin/sh\fP to start the same tiny +program we ran in the last case. +In Table \n[TABLE] the cost of asking the shell to go +look for the program is +quite large, frequently ten times as expensive as just creating a +new process, and four times as expensive as explicitly naming the location +of the new program. +.PP +The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. +Given that the processor is one of the fastest, the problem is likely to be +software. There is room for substantial improvement in the Solaris +process creation code. +.NH 2 +Context switching +.PP +Context switch time is defined here as +the time needed to save the state of one process and restore the state +of another process. +.PP +Context switches are frequently in the critical performance path of +distributed applications. For example, the multiprocessor versions +of the IRIX operating system use +processes to move data through the networking stack. This means that the +processing time for each new packet arriving at an idle system includes +the time needed to switch in the networking process. 
+.PP +Typical context switch benchmarks measure just the minimal context switch +time \(em the time to switch between two processes that are doing nothing +but context switching. We feel that this is +misleading because there are frequently more than two active processes, +and they usually have a larger working set (cache footprint) +than the benchmark processes. +.PP +Other benchmarks frequently include the cost of +the system calls needed to force the context switches. +For example, Ousterhout's context switch benchmark +measures context switch time plus a \*[read] and a \*[write] +on a pipe. +In many of the systems measured by \*[lmbench], the pipe overhead +varies between 30% and 300% of the context switch time, so we were +careful to factor out the pipe overhead. +.BU "Number of processes." +The context switch benchmark is implemented as +a ring of two to twenty processes that are connected with Unix pipes. +A token is passed from process to process, forcing context switches. +The benchmark measures the time needed to pass +the token two thousand times from process to process. +Each transfer of the token has two costs: the context switch, and +the overhead of passing the token. +In order to calculate just the context switching time, the benchmark first +measures the cost of passing the token through a ring of pipes in a +single process. This overhead time is defined as the cost of passing +the token and is not included in the reported context switch time. +.BU "Size of processes." +In order to measure more realistic context switch times, we add +an artificial variable size ``cache footprint'' to the switching +processes. The cost of the context switch then includes the cost +of restoring user-level state (cache footprint). The cache footprint +is implemented by having the process allocate an array of data\** +.FS +All arrays are at the same virtual +address in all processes. 
+.FE +and sum +the array as a series of integers after receiving the token but before +passing the token to the next process. Since most systems will cache data +across context switches, the working set for the benchmark is slightly +larger than the number of processes times the array size. +.PP +It is worthwhile to point out that the overhead mentioned above +also includes the cost of accessing the data, in the same way as +the actual benchmark. However, because the overhead is measured +in a single process, the cost is typically the cost with ``hot'' +caches. In the Figure 2, each size is plotted as a line, with +context switch times on the Y axis, number of processes on the +X axis, and the process size as the data set. +The process size and the hot cache overhead costs for +the pipe read/writes and any data access is what is labeled +as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead +is in microseconds. +.PP +The context switch time does not include anything other than +the context switch, provided that all the benchmark processes fit in the +cache. If the total size of all of the benchmark processes is larger +than the cache size, the cost of each context switch will include cache +misses. +We are trying to show realistic context switch times as a +function of both size and number of processes. +.TSTART 1 +.so ctx.pic +.FEND "Context switch times" 1 +.PP +Results for an Intel Pentium Pro system running Linux at 167 MHz are +shown in Figure \n[FIGURE]. +The data points on the figure are labeled with the working set +due to the sum of data in all of the processes. The actual working set is +larger, as it includes the process and kernel overhead as well. +One would expect the context switch times to stay constant until +the working set is +approximately the size of the second level cache. The Intel system has a +256K second level cache, and the context switch times +stay almost constant until about 256K (marked as .25M in the graph). 
+.BU "Cache issues" +The context switch benchmark is a deliberate measurement of the +effectiveness of the caches across process context switches. If the +cache does not include the process identifier (PID, also sometimes +called an address space identifier) as part of the address, then the +cache must be flushed on every context switch. If the cache does not map +the same virtual addresses from different processes to different cache +lines, then the cache will appear to be flushed on every context +switch. +.PP +If the caches do +not cache across context switches there would be no grouping at the +lower left corner of Figure \n[FIGURE], instead, the graph would +appear as a series of straight, horizontal, parallel lines. The number +of processes will not matter, the two process case will be just as bad +as the twenty process case since the cache would not be +useful across context switches. +.TSTART +.so ../Results/tmp/ctx.tbl +.TEND "Context switch time (microseconds)" +.PP +We picked four points on the graph and extracted those values for Table +\n[TABLE]. The complete set of values, as well as tools to graph them, +are included with \*[lmbench]. +.PP +Note that multiprocessor context switch times are frequently more expensive +than uniprocessor context switch times. This is because multiprocessor +operating systems tend to have very complicated scheduling code. +We believe that multiprocessor context switch times can be, and should be, +within 10% of the uniprocessor times. +.PP +Linux does quite well on context switching, especially on the more +recent architectures. By comparing the Linux 2 0K processes to the +Linux 2 32K processes, it is apparent that there is something wrong +with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can +find at least part of the cause. The second level cache latency for the +i586 is substantially worse than either the i686 or the Alpha. 
+.PP +Given the poor second level cache behavior of the PowerPC, it is surprising +that it does so well on context switches, especially the larger sized cases. +.PP +The Sun Ultra1 context switches quite well in part because of enhancements +to the register window handling in SPARC V9. +.NH 2 +Interprocess communication latencies +.PP +Interprocess communication latency is important because many operations +are control messages to another process (frequently on another +system). The time to tell the remote process to +do something is pure overhead and is frequently in the critical path +of important functions such as distributed applications (e.g., +databases, network servers). +.PP +The interprocess communication latency benchmarks typically have the +following form: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds needed +to do one round trip. For one way timing, +about half the round trip is right. However, the CPU cycles tend to be +somewhat asymmetric for one trip: receiving is typically more +expensive than sending. +.BU "Pipe latency" . +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +.PP +Pipes are frequently used as a local IPC mechanism. Because of the +simplicity of pipes, they are frequently the fastest portable +communication mechanism. +.PP +Pipe latency is measured by creating a pair of pipes, forking a child process, +and passing a word back and forth. This benchmark is identical to the +two-process, zero-sized context switch benchmark, except that it includes +both the context switching time and the pipe overhead in the results. +.nr NTABLE \n[TABLE]+1 +.nr LTABLE \n[TABLE] +Table \n[NTABLE] shows the round trip latency from process A to process B +and back to process A. 
+.TSTART +.so ../Results/tmp/lat_pipe.tbl +.TEND "Pipe latency (microseconds)" +.PP +The time can be broken down to two context switches plus four system calls +plus the pipe overhead. The context switch component is two of the small +processes in Table \n[LTABLE]. +This benchmark is identical to the context switch benchmark in +.RN Ousterhout90 . +.BU "TCP and RPC/TCP latency" . +TCP sockets may be viewed as an interprocess communication mechanism similar +to pipes with the added feature that TCP sockets work across machine +boundaries. +.PP +TCP and RPC/TCP connections are frequently used in low-bandwidth, +latency-sensitive applications. The default Oracle distributed +lock manager uses TCP sockets, and the locks per second available +from this service are accurately modeled by the TCP latency test. +.TSTART +.so ../Results/tmp/lat_tcp.tbl +.TEND "TCP latency (microseconds)" +.PP +Sun's RPC is layered either over TCP or over UDP. +The RPC layer is responsible for managing connections (the port mapper), +managing different byte orders and word sizes (XDR), and implementing a +remote procedure call abstraction. +Table \n[TABLE] shows the same benchmark with and +without the RPC layer to show the cost of the RPC implementation. +.PP +TCP latency is measured by having a server process that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is one +round-trip time. The measurements in Table \n[TABLE] are local +or loopback measurements, +since our intent is to show the overhead of the software. The same benchmark +may be, and frequently is, used to measure host-to-host latency. +.PP +Note that the RPC layer frequently adds hundreds of microseconds of +additional latency. The problem is not the external data +representation (XDR) layer \(em the +data being passed back and forth is a byte, so there is no XDR to be done. 
+There is no justification for the extra cost; it is simply +an expensive implementation. DCE RPC is worse. +.TSTART +.so ../Results/tmp/lat_udp.tbl +.TEND "UDP latency (microseconds)" +.BU "UDP and RPC/UDP latency" . +UDP sockets are an alternative to TCP sockets. They differ in that UDP +sockets are unreliable messages that leave the retransmission issues to +the application. UDP sockets have a few advantages, however. They preserve +message boundaries, whereas TCP does not; and a single UDP socket may +send messages +to any number of other sockets, whereas TCP sends data to only one place. +.PP +UDP and RPC/UDP messages are commonly used in many client/server applications. +NFS is probably the most widely used RPC/UDP application in the world. +.PP +Like TCP latency, UDP latency is measured by having a server process +that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is round-trip +time. The measurements in Table \n[TABLE] are local or loopback measurements, +since our intent is to show the overhead of the software. +Again, note that the RPC library can add hundreds of microseconds of extra +latency. +.\" .PP +.\" It is interesting to compare UDP latency with TCP latency. In many cases the +.\" TCP latency is \fBless\fP than the UDP latency. This flies in the face +.\" of conventional wisdom, which says that TCP is an inherently more expensive +.\" protocol than UDP. The reasons that TCP may appear faster are: in this +.\" benchmark, the protocol costs are dwarfed by the other costs (context +.\" switching, system calls, and driver overhead); and TCP is frequently +.\" hand-tuned for performance, while UDP is rarely hand-tuned. +.TSTART +.so ipc.tbl +.TEND "Remote latencies (microseconds)" +.BU "Network latency" . +We have a few results for over the wire latency included in Table \n[TABLE]. 
+As might be expected, the most heavily used network interfaces (i.e., ethernet) +have the lowest latencies. The times shown include the time on the wire, +which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit +ethernet and FDDI, and less than 10 microseconds for Hippi. +.BU "TCP connection latency" . +TCP is a connection-based, reliable, byte-stream-oriented protocol. As +part of this reliability, a connection must be established before any +data can be transferred. The connection is accomplished by a ``three-way +handshake,'' an exchange of packets when the client attempts to connect +to the server. +.PP +Unlike UDP, where no connection is established, TCP sends packets +at startup time. If an application creates a TCP connection to send +one message, then the startup time can be a substantial +fraction of the total connection and transfer costs. +The benchmark shows that the connection cost is approximately half of +the cost. +.PP +Connection cost is measured by having a server, registered using +the port mapper, waiting for connections. The client figures out where the +server is registered and then repeatedly times a \*[connect] system call to +the server. The socket is closed after each connect. Twenty connects +are completed and the fastest of them is used as the result. The time measured +will include two of the three packets that make up the three way TCP handshake, +so the cost is actually greater than the times listed. +.\" XXX Larry --- if a machine's clock granularity is on the order of +.\" 10 milliseconds, won't this benchmark run into granularity problems? 
+.TSTART +.so ../Results/tmp/lat_connect.tbl +.TEND "TCP connect latency (microseconds)" +.PP +Table \n[TABLE] shows that if the need is to send +a quick message to another process, given that most packets get through, +a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive +acknowledgments are needed, which they are in order to have an apples-to-apples +comparison with TCP). If the transmission medium is 10Mbit Ethernet, the +time on the wire will be approximately 65 microseconds each way, or 130 +microseconds total. To do the same thing with a short-lived TCP +connection would cost 896 microseconds of wire time alone. +.PP +The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor +is the point to suggest that all messages should be UDP. In many cases, +the difference between 130 microseconds and 900 microseconds is +insignificant compared with other aspects of application performance. +However, if the application is very latency sensitive +and the transmission medium is slow (such as serial link or a message +through many routers), then a UDP message may prove cheaper. +.NH 2 +File system latency +.PP +File system latency is defined as the time required to create or delete +a zero length file. +We define it this way because in many file systems, +such as the BSD fast file system, the directory operations are done +synchronously in order to maintain on-disk integrity. Since the +file data is typically cached and sent to disk at some later date, +the file creation and deletion become the bottleneck +seen by an application. This bottleneck is substantial: to do +a synchronous update to a disk is a matter of tens of milliseconds. +In many cases, this bottleneck is much more of a perceived performance +issue than processor speed. +.PP +The benchmark creates 1,000 zero-sized files and then deletes them. +All the files are created in one directory and their names are +short, such as "a", "b", "c", ... "aa", "ab", .... 
+.TSTART +.so lat_fs.tbl +.TEND "File system latency (microseconds)" +.PP +The create and delete latencies are shown in Table \n[TABLE]. +Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster +than the slowest systems. However, Linux does not guarantee +anything about the disk integrity; the directory operations are done in +memory. Other fast systems, such as SGI's XFS, use a log to guarantee the +file system integrity. +The slower systems, all those with ~10 millisecond file latencies, are +using synchronous writes to guarantee the file system integrity. +Unless Unixware has modified UFS substantially, they must be running in +an unsafe mode since the FreeBSD UFS is much slower and both file +systems are basically the 4BSD fast file system. +.NH 2 +Disk latency +.\" XXX - either get more results for this benchmark or delete it. +.\" I'd really like to not delete it - lmdd is probably the most +.\" useful tool and it gets the least press. +.PP +Included with \*[lmbench] is a small benchmarking program useful for +measuring disk and file I/O. \*[lmdd], which is patterned after +the Unix utility \f(CWdd\fP, measures both sequential and random I/O, +optionally generates patterns on output and checks them on input, +supports flushing the data from the buffer cache on systems that +support \f(CWmsync\fP, and has a very flexible user interface. +Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script +wrapped around \*[lmdd]. +.PP +While we could have generated both sequential and random I/O results as +part of this paper, we did not because those benchmarks are heavily +influenced by the performance of the disk drives used in the test. We +intentionally measure only the system overhead of a SCSI command since +that overhead may become a bottleneck in large database configurations. +.PP +Some important applications, such as transaction processing, are +limited by random disk IO latency. 
+Administrators can increase the number of disk operations per +second by buying more disks, until the processor overhead becomes +the bottleneck. +The \f(CWlmdd\fP benchmark measures the processor overhead associated with each +disk operation, and it can provide an upper bound on the number of +disk operations the processor can support. +It is designed for SCSI disks, and it assumes that most +disks have 32-128K read-ahead buffers and that they can read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true: a processor could be fast enough to make the +requests faster than the rotating disk. +If we take 6M/second to be disk +speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or +81 microseconds/IO. We don't know of any processor/OS/IO controller +combinations that can do an IO in 81 microseconds. +.FE +.PP +The benchmark simulates a large number of disks by reading 512byte +transfers sequentially from the raw disk device (raw disks are unbuffered +and are not read ahead by Unix). +Since the disk can read ahead faster than the system can request +data, the benchmark is doing small transfers of data from the +disk's track buffer. +Another way to look at this is that the benchmark +is doing memory-to-memory transfers across a SCSI channel. +It is possible to generate loads of more than 1,000 SCSI +operations/second on a single SCSI disk. For comparison, disks under +database load typically run at 20-80 operations per second. +.TSTART +.so ../Results/tmp/lat_disk.tbl +.TEND "SCSI I/O overhead (microseconds)" +.PP +The resulting overhead number represents a +\fBlower\fP bound on the overhead of a disk I/O. +The real overhead numbers will be higher on SCSI systems because +most SCSI controllers will not disconnect if the request can be +satisfied immediately. 
+During the benchmark, the processor simply sends the request and +transfers the data, while +during normal operation, the processor will send the request, +disconnect, get interrupted, reconnect, and transfer the data. +.PP +This technique can be used to discover how many drives a system can support +before the system becomes CPU-limited because it can produce the +overhead load of a fully configured system with just a few disks. +.NH 1 +Future work +.PP +There are several known improvements and extensions that could be made +to \*[lmbench]. +.BU "Memory latency" . +The current benchmark measures clean-read latency. By clean, we mean that +the cache lines being replaced are highly likely to be unmodified, so there +is no associated write-back cost. We would like to extend the benchmark +to measure dirty-read latency, as well as write latency. Other changes +include making the benchmark impervious to sequential prefetching and +measuring TLB miss cost. +.BU "MP benchmarks" . +None of the benchmarks in \*[lmbench] is designed to measure any +multiprocessor features directly. At a minimum, we could measure +cache-to-cache latency as well as cache-to-cache bandwidth. +.BU "Static vs. dynamic processes" . +In the process creation section, we allude to the cost of starting up processes +that use shared libraries. When we figure out how to create statically linked +processes on all or most systems, we could quantify these costs exactly. +.BU "McCalpin's stream benchmark" . +We will probably incorporate part or all of this benchmark into \*[lmbench]. +.BU "Automatic sizing" . +We have enough technology that we could determine the size of the external +cache and autosize the memory used such that the external cache had no effect. +.BU "More detailed papers" . +There are several areas that could yield some interesting papers. 
The
+memory latency section could use an in-depth treatment, and the
+context switching section could turn into an interesting discussion of
+caching technology.
+.NH 1
+Conclusion
+.PP
+\*[lmbench] is a useful, portable micro-benchmark suite designed to
+measure important aspects of system performance. We have found that a good
+memory subsystem is at least as important as the processor speed.
+As processors get faster and faster, more and more of the system design
+effort will need to move to the cache and memory subsystems.
+.NH 1
+Acknowledgments
+.PP
+Many people have provided invaluable help and insight into both the
+benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers
+were especially helpful.
+We thank all of them
+and especially thank:
+Ken Okin \s-1(SUN)\s0,
+Kevin Normoyle \s-1(SUN)\s0,
+Satya Nishtala \s-1(SUN)\s0,
+Greg Chesson \s-1(SGI)\s0,
+John Mashey \s-1(SGI)\s0,
+Neal Nuckolls \s-1(SGI)\s0,
+John McCalpin \s-1(Univ. of Delaware)\s0,
+Ron Minnich \s-1(Sarnoff)\s0,
+Chris Ruemmler \s-1(HP)\s0,
+Tom Rokicki \s-1(HP)\s0,
+and
+John Weitz \s-1(Digidesign)\s0.
+.PP
+We would also like to thank all of the people that have run the
+benchmark and contributed their results; none of this would have been possible
+without their assistance.
+.PP
+Our thanks to
+all of the free software community for tools that were used during this
+project.
+\*[lmbench] is currently developed on Linux, a copylefted Unix written by
+Linus Torvalds and his band of happy hackers.
+This paper and all of the
+\*[lmbench] documentation was produced using
+the \f(CWgroff\fP suite of tools written by James Clark.
+Finally, all of the data processing of the results is done with
+\f(CWperl\fP written by Larry Wall.
+.PP
+Sun Microsystems, and in particular Paul Borrill,
+supported the initial development of this project. Silicon Graphics
+has supported ongoing development that turned into far more time than we
+ever imagined. 
We are grateful to both of these companies for their +financial support. +.NH 1 +Obtaining the benchmarks +.PP +The benchmarks are available at +.ft I +http://reality.sgi.com/employees/lm_engr/lmbench.tgz +.ft +as well as via a mail server. +You may request the latest version of \*[lmbench] by sending email +to \fIarchives@xxxxxxxxxxxxxxxxxxx\fP with \fIlmbench-current*\fP +as the subject. +.\" .R1 +.\" bibliography references +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. 
+.R1 +accumulate +sort A+DT +database references +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references +.R2 +.so bios diff --git a/performance/lmbench3/doc/userguide.ms b/performance/lmbench3/doc/userguide.ms new file mode 100755 index 0000000..9bf3f4f --- /dev/null +++ b/performance/lmbench3/doc/userguide.ms @@ -0,0 +1,3782 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl '\(co 2002 \\*[author]'%'\fB\\*(DY DRAFT DO NOT DISTRIBUTE\fP' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2002 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. 
SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Measuring scalability +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds benchmp \f(CWbenchmp\fP +.ds bw_file_rd \f(CWbw_file_rd\fP +.ds bw_mem \f(CWbw_mem\fP +.ds bw_mmap_rd \f(CWbw_mmap_rd\fP +.ds bw_pipe \f(CWbw_pipe\fP +.ds bw_tcp \f(CWbw_tcp\fP +.ds bw_udp \f(CWbw_udp\fP +.ds bw_unix \f(CWbw_unix\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds execve \f(CWexecve\fP +.ds exit \f(CWexit\fP +.ds fcntl \f(CWfcntl\fP +.ds fork \f(CWfork\fP +.ds fstat \f(CWfstat\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getppid \f(CWgetppid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_connect \f(CWlat_connect\fP +.ds lat_ctx \f(CWlat_ctx\fP +.ds lat_fcntl \f(CWlat_fcntl\fP +.ds lat_fifo \f(CWlat_fifo\fP +.ds lat_fs \f(CWlat_fs\fP +.ds lat_http \f(CWlat_http\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_mmap \f(CWlat_mmap\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lat_pagefault \f(CWlat_pagefault\fP +.ds lat_pipe \f(CWlat_pipe\fP +.ds lat_proc \f(CWlat_proc\fP +.ds lat_rpc \f(CWlat_rpc\fP +.ds lat_select \f(CWlat_select\fP +.ds lat_sem \f(CWlat_sem\fP +.ds lat_sig \f(CWlat_sig\fP +.ds lat_syscall \f(CWlat_syscall\fP +.ds lat_tcp \f(CWlat_tcp\fP +.ds lat_udp \f(CWlat_udp\fP +.ds lat_unix \f(CWlat_unix\fP +.ds lat_unix_connect \f(CWlat_unix_connect\fP +.ds line \f(CWline\fP +.ds lmdd \f(CWlmdd\fP +.ds lmdd \f(CWlmdd\fP +.ds memmove \f(CWmemmove\fP +.ds mhz \f(CWmhz\fP +.ds mmap \f(CWmmap\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds 
pipe \f(CWpipe\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds select \f(CWselect\fP +.ds semop \f(CWsemop\fP +.ds sh \f(CW/bin/sh\fP +.ds stat \f(CWstat\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds tlb \f(CWtlb\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] ", " +.\" .R2 +.EQ +delim $$ +.EN +.TL +\s(14lmbench user guide\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. +It is also designed to make it easy for users to create +additional micro-benchmarks that can measure features, +algorithms, or subsystems of particular interest to the +user. +.SP +There is a timing harness, \*[benchmp], designed +to measure performance at specific levels of parallel +(simultaneous) load. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] is a widely used suite of micro-benchmarks +that measures important aspects of computer system +performance, such as memory latency and bandwidth. +Crucially, the suite is written in portable ANSI-C +using POSIX interfaces and is intended to run on a +wide range of systems without modification. +.LP +The benchmarks included in the suite were chosen +because in the \*[lmbench] developer's experience, +they each represent an aspect of system performance +which has been crucial to an application's +performance. +.LP +In general the benchmarks report either the latency +or bandwidth of an operation or data pathway. 
The +exceptions are generally those benchmarks that +report on a specific aspect of the hardware, such +as the processor clock rate, which is reported +in MHz and nanoseconds. +.LP +\*[lmbench] consists of three major components: +a timing harness, the individual benchmarks +built on top of the timing harness, and the +various scripts and glue that build and run the +benchmarks and process the results. +.NH 2 +\*[lmbench] history +.LP +\*[lmbench1] was written by Larry McVoy +while he was at Sun Microsystems. It focussed +on two measures of system performance: latency +and bandwidth. It measured a number of basic +operating system functions, such as file system +read/write bandwidth or file creation time. It +also focussed a great deal of energy on measuring +data transfer operations, such as \*[bcopy] and +\*[pipe] latency and bandwidth as well as raw +memory latency and bandwidth. +.LP +Shortly after +.RN McVoy96 +was published, +.RN Brown97 +examined the \*[lmbench] benchmarks and published +a detailed critique of its strengths and weaknesses. +Largely in response to these remarks, development +of \*[lmbench2] began with a focus on +improving the experimental design and statistical +data analysis. The primary change was the development +and adoption across all the benchmarks of a timing +harness that incorporated loop-autosizing and clock +resolution detection. In addition, each experiment +was typically repeated eleven times with the median +result reported to the user. +.LP +\*[lmbench3] focussed on extending +\*[lmbench]'s functionality along two dimensions: +measuring multi-processor scalability and measuring +basic aspects of processor architecture. +.LP +There are any number of aspects of a computer's +micro-architecture that can impact a program's +performance, such as the design of the memory +hierarchy and the basic performance of the various +arithmetic units. 
+.LP +All of the new benchmarks were added to \*[lmbench] +because the author needed them to help guide his +design decisions in one or more projects over the +last few years. +For example, \*[lat_ops] was added because the +author was trying to decide whether a particular +image processing algorithm should be implemented +using integer or floating point arithmetic. +Floating point arithmetic was preferred for a +variety of reasons, but it was feared that +floating point arithmetic would be prohibitively +expensive compared to integer operations. +By quickly building \*[lat_ops] the author was +able to verify that the floating point performance +should be no worse than integer performance. +.LP +An important feature of multi-processor systems is their +ability to scale their performance. \*[lmbench1] +was able to measure various important aspects of +system performance, except that only one client process +was active at a time +.RN McVoy96 . +\*[lmbench2] introduced a new macro, BENCH(), which +implemented a sophisticated timing harness that +automatically managed nearly all aspects of accurately +timing operations +.RN Staelin98 . +For example, it automatically +detects the minimal timing interval necessary to +provide timing results within 1% accuracy, and it +automatically repeats most experiments eleven times +and reports the median result. +.LP +However, this timing harness is incapable of measuring +the performance of a system under scalable loads. +\*[lmbench3] took the ideas and techniques +developed in the earlier versions and extended them +to create a new timing harness which can measure +system performance under parallel, scalable loads. +.LP +\*[lmbench3] also includes a version of John +McCalpin's STREAM benchmarks. Essentially the STREAM +kernels were placed in the new \*[lmbench] timing harness. +Since the new timing harness also measures scalability +under parallel load, the \*[lmbench3] STREAM +benchmarks include this capability automatically. 
+.LP +Finally, \*[lmbench3] includes a number of new +benchmarks which measure various aspects of the +processor architecture, such as basic operation +latency and parallelism, to provide developers +with a better understanding of system capabilities. +The hope is that better informed developers will +be able to better design and evaluate performance +critical software in light of their increased +understanding of basic system performance. +.NH 1 +Prior Work +.LP +Benchmarking is not a new field of endeavor. +There are a wide variety of approaches to +benchmarking, many of which differ greatly +from that taken by \*[lmbench]. +.LP +One common form of benchmark is to take an +important application or application and +worklist, and to measure the time required +to complete the entire task. +This approach is particularly useful when +evaluating the utility of systems for a +single and well-known task. +.LP +Other benchmarks, such as SPECint, use a +variation on this approach by measuring +several applications and combining the +results to predict overall performance. +.\" .LP +.\" XXX Byte benchmark +.LP +Another variation takes the "kernel" of +an important application and measures its +performance, where the "kernel" is usually +a simplification of the most expensive +portion of a program. +Dhrystone +.RN Weicker84 +is an example of this type of +benchmark as it measures the performance +of important matrix operations and was often +used to predict system performance for +numerical operations. +.LP +.RN Banga98 +developed a benchmark to measure HTTP server +performance which can accurately measure +server performance under high load. +Due to the idiosyncracies of the HTTP protocol +and TCP design and implementation, there are +generally operating system limits on the rate +at which a single system can generate +independent HTTP requests. +However, +.RN Banga98 +developed a system which can scalably present +load to HTTP servers in spite of this limitation. 
+.LP +John McCalpin's STREAM benchmark measures +memory bandwidth during four common vector +operations +.RN McCalpin95 . +It does not measure memory latency, and +strictly speaking it does not measure raw +memory bandwidth although memory bandwidth +is crucial to STREAM performance. +More recently, work has begun on extending +STREAM to measure scalable memory subsystem +performance, particularly for multi-processor +machines. +.LP +Uros Prestor +.RN Prestor01 +XXX +.LP +Micro-benchmarking extends this "kernel" +approach, by measuring the performance +of operations or resources in isolation. +\*[lmbench] and many other benchmarks, such +as nfsstone +.RN Shein89 , +measure the performance of key operations so +users can predict performance for certain +workloads and applications by combining the +performance of these operations in the right +mixture. +.LP +.RN Saavedra92 +takes the micro-benchmark approach and applies +it to the problem of predicting application +performance. +They analyze applications or other benchmarks +in terms of their ``narrow spectrum benchmarks'' +to create a linear model of the application's +computing requirements. +They then measure the computer system's +performance across this set of micro-benchmarks +and use a linear model to predict the application's +performance on the computer system. +.RN Seltzer99 +applied this technique using the features +measured by \*[lmbench] as the basis for +application prediction. +.LP +Benchmarking I/O systems has proven particularly +troublesome over the years, largely due to the +strong non-linearities exhibited by disk systems. +Sequential I/O provides much higher bandwidth +than non-sequential I/O, so performance is +highly dependent on the workload characteristics +as well as the file system's ability to +capitalize on available sequentiality by +laying out data contiguously on disk. +.LP +I/O benchmarks have a tendency to age poorly.
+For example, IOStone +.RN Park90a , +IOBench +.RN Wolman89 , +and the Andrew benchmark +.RN Howard88 +used fixed size datasets, whose size was +significant at the time, but which no longer +measure I/O performance as the data can now +fit in the processor cache of many modern +machines. +.LP +The Andrew benchmark attempts to separately +measure the time to create, write, re-read, +and then delete a large number of files in +a hierarchical file system. +.LP +Bonnie +.RN Bray90 +measures sequential, streaming I/O bandwidth +for a single process, and random I/O latency +for multiple processes. +.LP +Peter Chen developed an adaptive harness for +I/O benchmarking +.RN Chen94a , +which defines I/O load in terms of five parameters, +uniqueBytes, sizeMean, readFrac, seqFrac, and +processNum. The benchmark then explores the +parameter space to measure file system performance +in a scalable fashion. +.NH 1 +Computer Architecture Primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches. +.LP +Of necessity, this primer over-simplifies the +many details and variations of specific computer +designs and architectures. For more information, +please see +.RN Hennessy96 . +.TSTART 1 +.so lmbench3_arch.pic +.FEND "Architecture diagram" 1 +.LP +Figure \n[FIGURE] contains a greatly simplified block diagram +of a computer. Various important elements, such as +the I/O bus and devices, have been left out. The +core of the processor are the registers (r0, ..., rn +and f0, ..., fn) and the arithmetic units (ALU and FPU). +In general, the arithmetic units can access data in +registers ''instantly''. Often data must be explicitly +loaded from memory into a register before it can be +manipulated by the arithmetic units. 
+.LP +The ALU handles integer arithmetic, such as bit +operations (AND, OR, XOR, NOT, and SHIFT) as +well as ADD, MUL, DIV, and MOD. Sometimes there +is specialized hardware to handle one or more +operations, such as a barrel shifter for SHIFT +or a multiplier, and sometimes there is no +hardware support for certain operations, such +as MUL, DIV, and MOD. +.LP +The FPU handles floating point arithmetic. +Sometimes there are separate FPUs for single +and double precision floating point operations. +.NH 2 +Memory Hierarchy +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip. In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced.
The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size +.RN Hennessy96 . +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. +A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time. +This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. 
+.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 1 +Timing Harness +.LP +The first, and most crucial element in extending +\*[lmbench2] so that it could measure scalable +performance, was to develop a new timing harness +that could accurately measure performance for +any given load. +Once this was done, then each benchmark would +be migrated to the new timing harness. +.LP +The harness is designed to accomplish a number +of goals: +.IP 1. +during any timing interval of any child it is +guaranteed that all other child processes are +also running the benchmark +.IP 2. +the timing intervals are long enough to average +out most transient OS scheduler effects +.IP 3. +the timing intervals are long enough to ensure +that error due to clock resolution is negligible +.IP 4. +timing measurements can be postponed to allow +the OS scheduler to settle and adjust to the +load +.IP 5. +the reported results should be representative +and the data analysis should be robust +.IP 6.
+timing intervals should be as short as possible +while ensuring accurate results +.LP +Developing an accurate timing harness with a +valid experimental design is more difficult +than is generally supposed. +Many programs incorporate elementary timing +harnesses which may suffer from one or more +defects, such as insufficient care taken to +ensure that the benchmarked operation is run +long enough to ensure that the error introduced +by the clock resolution is insignificant. +The basic elements of a good timing harness +are discussed in +.RN Staelin98 . +.LP +The new timing harness must also collect and process +the timing results from all the child processes so +that it can report the representative performance. +It currently reports the median performance over +all timing intervals from all child processes. It +might perhaps be argued that it should report the +median of the medians. +.LP +Most of the benchmarks now accept a "-P <parallelism>" +flag, and the timing harness does the right thing to +try and measure parallel application performance. +.LP +When running benchmarks with more than one child, +the harness must first get a baseline estimate +of performance by running the benchmark in only +one process using the standard \*[lmbench] timing +interval, which is often 5,000 micro-seconds. +Using this information, the harness can compute +the average time per iteration for a single +process, and it uses this figure to compute the +number of iterations necessary to ensure that +each child runs for at least one second. +.NH 2 +Clock resolution +.LP +\*[lmbench] uses the \*[gettimeofday] clock, whose +interface resolves time down to 1 micro-second. +However, many system clock's resolution is only 10 +milli-seconds, and there is no portable way to query +the system to discover the true clock resolution. 
+.LP +The problem is that the timing intervals must +be substantially larger than the clock resolution +in order to ensure that the timing error doesn't +impact the results. For example, the true duration +of an event measured with a 10 milli-second clock +can vary $+-$10 milli-seconds from the true time, +assuming that the reported time is always a +truncated version of the true time. If the clock +itself is not updated precisely, the true error +can be even larger. +This implies that timing intervals on these systems +should be at least 1 second. +.LP +However, the \*[gettimeofday] clock resolution in +most modern systems is 1 micro-second, so timing +intervals can be as small as a few milli-seconds +without incurring significant timing errors related +to clock resolution. +.LP +Since there is no standard interface to query the operating +system for the clock resolution, \*[lmbench] must +experimentally determine the appropriate timing +interval duration which provides results in a timely +fashion with a negligible clock resolution error. +.NH 2 +Coordination +.LP +Developing a timing harness that correctly manages +$N$ processes and accurately measures system performance +over those same $N$ processes is significantly more difficult +than simply measuring system performance with a single +process because of the asynchronous nature of +parallel programming. +.LP +In essence, the new timing harness needs to create +$N$ jobs, and measure the average performance of the +target subsystem while all $N$ jobs are running. This +is a standard problem for parallel and distributed +programming, and involves starting the child +processes and then stepping through a handshaking +process to ensure that all children have started +executing the benchmarked operation before any child +starts taking measurements. +.TSTART 1 +.TS +box tab (/) allbox expand ; +c c +l l .
+Parent/Child +T{ +start up P child processes +T}/T{ +run benchmark operation for a little while +T} +T{ +wait for P "ready" signals +T}/T{ +send a "ready" signal +T} +T{ +[sleep for "warmup" microseconds] +T}/T{ +run benchmark operation while polling for a "go" signal +T} +T{ +send "go" signal to P children +T}/T{ +begin timing benchmark operation +T} +T{ +wait for P "done" signals +T}/T{ +send a "done" signal +T} +T{ +for each child, send "results" signal and gather results +T}/T{ +run benchmark operation while polling for a "results" signal +T} +T{ +collate results +T}/T{ +send timing results and wait for "exit" signal +T} +T{ +send "exit" signal +T}/T{ +exit +T} +.TE +.TEND "Timing harness sequencing" +.LP +Table \n[TABLE] shows how the parent and child +processes coordinate their activities to ensure +that all children are actively running the +benchmark activity while any child could be +taking timing measurements. +.LP +.NH 2 +Accuracy +.LP +The new timing harness also needs to ensure that the +timing intervals are long enough for the results to +be representative. The previous timing harness assumed +that only single process results were important, and +it was able to use timing intervals as short as +possible while ensuring that errors introduced by +the clock resolution were negligible. +In many instances this meant that the timing intervals +were smaller than a single scheduler time slice. +The new timing harness must run benchmarked items +long enough to ensure that timing intervals are longer +than a single scheduler time slice. +Otherwise, you can get results which are complete nonsense. 
+For example, running several copies of an \*[lmbench2] +benchmark on a uni-processor machine will often report +that the performance with $N$ jobs running in parallel +is equivalent to the performance with a single job running!\** +.FS +This was discovered by someone who naively attempted +to parallelize \*[lmbench2] in this fashion, and I +received a note from the dismayed developer describing +the failed experiment. +.FE +.LP +In addition, since the timing intervals now have to be +longer than a single scheduler time slice, they also +need to be long enough so that a single scheduler time +slice is insignificant compared to the timing interval. +Otherwise the timing results can be dramatically +affected by small variations in the scheduler's +behavior. +.NH 2 +Resource consumption +.LP +One important design goal was that resource consumption +be constant with respect to the number of child +processes. +This is why the harness uses shared pipes to communicate +with the children, rather than having a separate set of +pipes to communicate with each child. +An early design of the system utilized a pair of pipes +per child for communication and synchronization between +the master and slave processes. However, as the number +of child processes grew, the fraction of system +resources consumed by the harness grew and the additional +system overhead could start to interfere with the accuracy +of the measurements. +.LP +Additionally, if the master has to poll (\*[select]) +$N$ pipes, then the system overhead of that operation +also scales with the number of children. +.NH 2 +Pipe atomicity +.LP +Since all communication between the master process and +the slave (child) processes is done via a set of shared +pipes, we have to ensure that we never have a situation +where the message can be garbled by the intermingling +of two separate messages from two separate children. 
+This is ensured by either using pipe operations that +are guaranteed to be atomic on all machines, or by +coordinating between processes so that at most one +process is writing at a time. +.LP +The atomicity guarantees are provided by having each +client communicate synchronization states in one-byte +messages. For example, the signals from the master +to each child are one-byte messages, so each child +only reads a single byte from the pipe. Similarly, +the responses from the children back to the master +are also one-byte messages. In this way no child +can receive partial messages, and no message can +be interleaved with any other message. +.LP +However, using this design means that we need to +have a separate pipe for each \fIbarrier\fR in +the process, so the master uses three pipes to +send messages to the children, namely: \fIstart_signal\fR, +\fIresult_signal\fR, and \fIexit_signal\fR. +If a single pipe was used for all three barrier events, +then it is possible for a child to miss a signal, +or if the signal is encoded into the message, +then it is possible for a child to infinite loop +pulling a signal off the pipe, recognizing that +it has already received that signal so that it +needs to push it back into the pipe, and then +re-receiving the same message it just re-sent. +.LP +However, all children share a single pipe to send +data back to the master process. Usually the +messages on this pipe are single-byte signals, +such as \fIready\fR or \fIdone\fR. However, the +timing data results need to be sent from the +children to the master and they are (much) larger +than a single-byte message. In this case, the +timing harness sends a single-byte message on +the \fIresult_signal\fR channel, which can be +received by at most one child process. This +child then knows that it has sole ownership of +the response pipe, and it writes its entire +set of timing results to this pipe.
Once the +master has received all of the timing results +from a single child, it sends the next one-byte +message on the \fIresult_signal\fR channel to +gather the next set of timing results. +.TSTART 1 +.so lmbench3_signals.pic +.FEND "Control signals" 1 +.LP +The design of the signals is shown in Figure \n[FIGURE]. +.NH 2 +Benchmark initialization +.LP +By allowing the benchmark to specify an +initialization routine that is run in the +child processes, the new timing harness +allows benchmarks to do either or both +global initializations that are shared +by all children and specific per-child +initializations that are done independently +by each child. +Global initialization is done in the +master process before the \*[benchmp] +harness is called, so the state is +preserved across the \*[fork] operations. +Per-child initialization is done inside +the \*[benchmp] harness by the optional +initialization routine and is done after +the \*[fork] operation. +.LP +Similarly, each benchmark is allowed to +specify a cleanup routine that is run by +the child processes just before exiting. +This allows the benchmark routines to +release any resources that they may have +used during the benchmark. +Most system resources would be automatically +released on process exit, such as file +descriptors and shared memory segments, +but some resources such as temporary files +might need to be explicitly released by +the benchmark. +.NH 2 +Scheduler transients +.LP +Particularly on multi-processor systems, side-effects +of process migration can dramatically affect program +runtimes. For example, if the processes are all +initially assigned to the same processor as the parent +process, and the timing is done before the scheduler +migrates the processes to other available processors, +then the system performance will appear to be that of +a uniprocessor. 
Similarly, if the scheduler is
+over-enthusiastic about re-assigning processes to
+processors, then performance will be worse than
+necessary because the processes will keep encountering
+cold caches and will pay exorbitant memory access
+costs.
+.LP
+The first case is a scheduler transient, and users
+may not want to measure such transient phenomena
+if their primary interest is in predicting performance
+for long-running programs. Conversely, that same
+user would be extraordinarily interested in the
+second phenomenon. The harness was designed to
+allow users to specify that the benchmarked processes
+are run for long enough to (hopefully) get the
+scheduler past the transient startup phase, so it
+can measure the steady-state behavior.
+.NH 2
+Data analysis
+.LP
+Analyzing the data to produce representative results
+is a crucial step in the benchmarking process.
+\*[lmbench] generally reports the \fImedian\fP
+result for $11$ measurements.
+Most benchmarks report the results of a single measurement
+.RN Howard88 ,
+an average of several results
+.RN McCalpin95 ,
+or a trimmed mean
+.RN Brown97 .
+XXX UNKNOWN:
+.RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90
+.LP
+Since \*[lmbench] is able to use timing intervals
+that are often smaller than a scheduler time slice,
+the raw timing results are often severely skewed.
+The median is preferable to the mean when the data
+can be very skewed
+.RN Jain91 .
+.LP
+In some instances, however, \*[lmbench] internally
+uses the \fIminimum\fP rather than the median,
+such as in \*[mhz].
+In those instances, we are not trying to find the
+\fIrepresentative\fP value, but rather the
+\fIminimum\fP value.
+There are only a few sources of error which could
+cause the measured timing result to be shorter
+than the true elapsed time: the system clock is
+adjusted, or round-off error in the clock resolution.
+The timing interval duration is set to ensure that +the round-off error is bounded to 1% of the timing +interval, and we blithely assume that people don't +reset their system clocks while benchmarking their +systems. +.LP +\*[lmbench] does not currently report any statistics +representing measurement variation, such as the +difference between the first and third quartiles. +.NH 1 +Interface +.LP +Unfortunately we had to move away from the +macro-based timing harness used in \*[lmbench2] +and migrate to a function-based system. +.LP +The new interface looks like: +.DS +typedef void (*bench_f)(uint64 iterations, + void* cookie); +typedef void (*support_f)(void* cookie); + +extern void benchmp(support_f initialize, + bench_f benchmark, + support_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie); +.DE +.LP +A brief description of the parameters: +.IP \fIenough\fR +Enough can be used to ensure that a timing interval is at +least 'enough' microseconds in duration. For most benchmarks +this should be zero, but some benchmarks have to run for more +time due to startup effects or other strange behavior. +.IP \fIparallel\fR +is simply the number of instances of the benchmark +that will be run in parallel on the system. +.IP \fIwarmup\fR +can be used to force the benchmark to run for warmup +microseconds before the system starts making timing measurements. +Note that it is a lower bound, not a fixed value, since it +is simply the time that the parent sleeps after receiving the +last "ready" signal from each child (and before it sends +the "go" signal to the children). +.IP \fIrepetitions\fR +is the number of times the experiment should +be repeated. The default is eleven. +.IP \fIcookie\fR +is a pointer that can be used by the benchmark +writer to pass in configuration information, such as buffer +size or other parameters needed by the inner loop. 
+In \*[lmbench3] it is generally used to point
+to a structure containing the relevant configuration
+information.
+.LP
+To write a simple benchmark for getppid() all you would need
+to do is:
+.DS
+void
+benchmark_getppid(uint64 iterations,
+ void* cookie)
+{
+ while (iterations-- > 0) {
+ getppid();
+ }
+}
+.DE
+.LP
+and then somewhere in your program you might call:
+.DS
+benchmp(NULL, benchmark_getppid, NULL,
+ 0, 1, 0, TRIES, NULL);
+micro("getppid", get_n());
+.DE
+.LP
+A more complex example which has "state" and uses the
+initialization and cleanup capabilities might look something
+like this:
+.DS
+struct bcopy_state {
+ int len;
+ char* src;
+ char* dst;
+};
+.DE
+.DS
+void
+initialize_bcopy(void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ state->src = valloc(state->len);
+ state->dst = valloc(state->len);
+
+ bzero(state->src, state->len);
+ bzero(state->dst, state->len);
+}
+.DE
+.DS
+void
+benchmark_bcopy(uint64 iterations,
+ void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ while (iterations-- > 0) {
+ bcopy(state->src,
+ state->dst, state->len);
+ }
+}
+.DE
+.DS
+void
+cleanup_bcopy(void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ free(state->src);
+ free(state->dst);
+}
+.DE
+.LP
+and then your program might look something like:
+.DS
+#include "bench.h"
+int
+main()
+{
+ struct bcopy_state state;
+
+ state.len = 8 * 1024 * 1024;
+ benchmp(initialize_bcopy,
+ benchmark_bcopy,
+ cleanup_bcopy,
+ 0, 1, 0, TRIES, &state);
+ fprintf(stderr, "bcopy: ");
+ mb(state.len * get_n());
+ exit(0);
+}
+.DE
+.LP
+Note that this particular micro-benchmark would measure
+cache-to-cache \*[bcopy] performance unless the amount of
+memory being copied was larger than half the cache size.
+A slightly more sophisticated approach might allocate +as much memory as possible and then \*[bcopy] from one +segment to another, changing segments within the allocated +memory before each \*[bcopy] to defeat the caches. +.NH 1 +Benchmarks +.LP +\*[lmbench] contains a large number of micro-benchmarks +that measure various aspects of hardware and operating +system performance. The benchmarks generally measure +latency or bandwidth, but some new benchmarks also +measure parallelism. +.TSTART +.TS +center box tab (&); +c c +l & l . +Name&Measures +_ +&Bandwidth +bw_file_rd&T{ +\*[read] and then load into processor +T} +bw_mem&T{ +read, write, and copy data to/from memory +T} +bw_mmap_rd&read from \*[mmap]'ed memory +bw_pipe&\*[pipe] inter-process data copy +bw_tcp&TCP inter-process data copy +bw_unix&UNIX inter-process +_ +&Latency +lat_connect&TCP socket connection +lat_ctx&T{ +context switch via \*[pipe]-based ``hot-potato'' token passing +T} +lat_fcntl&\*[fcntl] operation +lat_fifo&T{ +FIFO ``hot-potato'' token passing +T} +lat_fs&file creation and deletion +lat_http&http GET request latency +lat_mem_rd&memory read +lat_mmap&\*[mmap] operation +lat_ops&basic operations +lat_pagefault&page fault handler +lat_pipe&\*[pipe] ``hot-potato'' token passing +lat_proc&T{ +procedure call overhead and process creation using \*[fork], +\*[fork] and \*[execve], and \*[fork] and \*[sh] +T} +lat_rpc&SUN RPC procedure call +lat_select&\*[select] +lat_sem&T{ +semaphore ``hot-potato'' token passing +T} +lat_sig&T{ +signal handle installation and handling +T} +lat_syscall&\*[getppid], \*[write], \*[stat], \*[fstat], \*[open], \*[close] +lat_tcp&TCP ``hot-potato'' token passing +lat_udp&UDP ``hot-potato'' token passing +lat_unix&UNIX ``hot-potato'' token passing +lat_unix_connect&UNIX socket connection +_ +&Parallelism +par_mem&memory subsystem +par_ops&T{ +instruction-level parallelism of basic arithmetic operations +T} +_ +mhz&CPU clock frequency +line&cache line size +tlb&number 
of pages mapped by TLB +stream&STREAM clones +lmdd&\fIdd\fR clone +.TE +.TEND "\*[lmbench] micro-benchmarks" +.LP +Table \n[TABLE] contains the full list of micro-benchmarks +in \*[lmbench]. +.NH 2 +Bandwidth +.LP +.LP +By bandwidth, we mean the rate at which a particular facility can move +data. +We attempt to measure the data movement ability of a number of +different facilities: +library \*[bcopy], +hand-unrolled \*[bcopy], +direct-memory read and write (no copying), +pipes, +TCP sockets, +the \*[read] interface, +and +the \*[mmap] interface. +.NH 2 +Memory bandwidth +.LP +Data movement is fundamental to any operating system. +In the past, performance +was frequently measured in MFLOPS because floating point units were +slow enough that microprocessor systems were +rarely limited by memory bandwidth. Today, floating point units are usually much +faster than memory bandwidth, so many current MFLOP ratings can not be +maintained using memory-resident data; they are ``cache only'' ratings. +.LP +We measure the ability to +copy, read, and write data over a varying set of sizes. +There are too many results to report all of them here, so we concentrate on +large memory transfers. +.LP +We measure copy bandwidth two ways. The first is the user-level library +\*[bcopy] interface. +The second is a hand-unrolled loop that loads and stores +aligned 8-byte words. +In both cases, we took care to +ensure that the source and destination locations would not map to the same +lines if the any of the caches were direct-mapped. +In order to test memory bandwidth rather than cache bandwidth, +both benchmarks copy an 8M\** area to another 8M area. +(As secondary caches reach 16M, these benchmarks will have to +be resized to reduce caching effects.) +.FS +Some of the PCs had less than 16M of available memory; +those machines copied 4M. 
+.FE +.LP +The copy results actually represent one-half to one-third of the memory +bandwidth used to obtain those results since we are reading and writing +memory. If the cache line size is larger than the word stored, then +the written cache line will typically be read before it is written. The +actual amount of memory bandwidth used varies because some architectures +have special instructions specifically designed for the \*[bcopy] +function. Those architectures will move twice as much memory as +reported by this benchmark; less advanced architectures move three +times as much memory: the memory read, the memory read because it is +about to be overwritten, and the memory written. +.LP +The \*[bcopy] results reported in Table 2 +may be correlated with John McCalpin's \*[stream] +.RN McCalpin95 +benchmark results in the following manner: +the \*[stream] benchmark reports all of the memory moved +whereas the \*[bcopy] benchmark reports the bytes copied. So our +numbers should be approximately one-half to one-third of his numbers. +.LP +Memory reading is measured by an unrolled loop that sums up a series of +integers. On most (perhaps all) systems measured the integer +size is 4 bytes. The loop is unrolled such that most compilers generate +code that uses a constant offset with the load, resulting in a load and +an add for each word of memory. The add is an integer add that completes +in one cycle on all of the processors. Given that today's processor +typically cycles at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000 +ns per cache line, the results reported here should be dominated by the +memory subsystem, not the processor add unit. +.LP +The memory contents are added up because almost all C compilers +would optimize out the whole loop when optimization was turned on, and +would generate far too many instructions without optimization. 
+The solution is to +add up the data and pass the result as an unused argument to the +``finish timing'' function. +.LP +Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect +that pure reads should run at roughly twice the speed of \*[bcopy]. +Exceptions to this rule should be studied, for exceptions indicate a bug +in the benchmarks, a problem in \*[bcopy], or some unusual hardware. +.TSTART +.so bw_allmem.tbl +.TEND "Memory bandwidth (MB/s)" +.LP +Memory writing is measured by an unrolled loop that stores a value into +an integer (typically a 4 byte integer) and then increments the pointer. +The processor cost of each memory operation is approximately the same +as the cost in the read case. +.LP +The numbers reported in Table \n[TABLE] +are not the raw hardware speed in some cases. +The Power2\** is capable of up to 800M/sec read rates +.FS +Someone described this machine as a $1,000 processor on a $99,000 memory +subsystem. +.FE +.RN McCalpin95 +and HP PA RISC (and other prefetching) +systems also do better if higher levels of code optimization used +and/or the code is hand tuned. +.LP +The Sun libc bcopy in Table \n[TABLE] +is better because they use a hardware specific bcopy +routine that uses instructions new in SPARC V9 that were added specifically +for memory movement. +.LP +The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because, +according to Intel, the write transaction turns into a read followed by +a write to maintain cache consistency for MP systems. +.NH 2 +IPC bandwidth +.LP +Interprocess communication bandwidth is frequently a performance issue. +Many Unix applications are composed of several processes communicating +through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation +system that prepared this paper, the \f(CWX Window System\fP, remote file access, +and \f(CWWorld Wide Web\fP servers. 
+.LP +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +TCP sockets are similar +to pipes except they are bidirectional and can cross machine +boundaries. +.LP +Pipe bandwidth is measured by creating two processes, a writer and a +reader, which transfer 50M of data in 64K transfers. +The transfer size was chosen so that the overhead of system calls +and context switching would not dominate the benchmark time. +The reader prints the timing results, which guarantees that all +data has been moved before the timing is finished. +.LP +TCP bandwidth is measured similarly, except the data is transferred in +1M page aligned transfers instead of 64K transfers. If the TCP +implementation supports it, the send and receive socket buffers are +enlarged to 1M, instead of the default 4-60K. We have found that +setting the transfer size equal to the socket buffer size produces the +greatest throughput over the most implementations. +.TSTART +.so bw_ipc.tbl +.TEND "Pipe and local TCP bandwidth (MB/s)" +.LP +\*[bcopy] is important to this test because the +pipe write/read is typically implemented as a \*[bcopy] into the kernel +from the writer and then a \*[bcopy] from the kernel to the reader. +Ideally, these results would be approximately one-half of the +\*[bcopy] results. It is possible for the kernel \*[bcopy] +to be faster than the C library \*[bcopy] since the kernel may have +access to \*[bcopy] hardware unavailable to the C library. +.LP +It is interesting to compare pipes with TCP because the TCP benchmark is +identical to the pipe benchmark except for the transport mechanism. +Ideally, the TCP bandwidth would be as good as the pipe +bandwidth. It is not widely known that the +majority of the TCP cost is in the \*[bcopy], the checksum, +and the network interface driver. 
+The checksum and the driver may be safely eliminated in the loopback +case and if the costs have been eliminated, then TCP should be just as +fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to +see that Solaris and HP-UX have done this optimization. +.LP +Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the +pipe transfers are done in 64K buffers, a size that frequently fits in +caches, while the bcopy is typically an 8M-to-8M copy, which does not +fit in the cache. +.LP +In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than +the SGI MP on pipe bandwidth because of caching effects - in the UP +case, both processes share the cache; on the MP, each process is +communicating with a different cache. +.LP +All of the TCP results in Table \n[TABLE] are in loopback mode \(em that +is both ends of the socket are on the same machine. It was impossible +to get remote networking results for all the machines included in this +paper. We are interested in receiving more results for identical +machines with a dedicated network connecting them. The results we have +for over the wire TCP bandwidth are shown below. +.TSTART +.so bw_tcp.tbl +.TEND "Remote TCP bandwidth (MB/s)" +.LP +The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. +The SGI Hippi interface has hardware support for TCP checksums and +the IRIX operating system uses virtual memory tricks to avoid copying +data as much as possible. +For larger transfers, SGI Hippi has reached 92MB/s over TCP. +.LP +100baseT is looking quite competitive when compared to FDDI in Table +\n[TABLE], even though FDDI has packets that are almost three times +larger. We wonder how long it will be before we see gigabit ethernet +interfaces. +.NH 2 +Cached I/O bandwidth +.LP +Experience has shown us that reusing data in the file system +page cache can be a performance issue. This +section measures that operation through two interfaces, \*[read] and +\*[mmap]. 
+The benchmark here is not an I/O benchmark in that no disk activity is +involved. +We wanted to measure the overhead +of reusing data, an overhead that is CPU intensive, rather than disk intensive. +.LP +The \*[read] interface copies data from the kernel's file system page cache into the +process's buffer, using 64K buffers. The transfer size was chosen +to minimize the kernel entry overhead while +remaining realistically sized. +.LP +The difference between the \*[bcopy] and the \*[read] benchmarks +is the cost of the file and virtual memory system overhead. In most +systems, the \*[bcopy] speed should be faster than the \*[read] speed. The +exceptions usually have hardware specifically designed +for the \*[bcopy] function and that hardware may be available only to +the operating system. +.LP +The \*[read] benchmark is implemented by rereading a file +(typically 8M) in 64K +buffers. Each buffer is summed as a series of integers in the user +process. The summing is done for two reasons: for an apples-to-apples +comparison the memory-mapped benchmark needs to touch all the data, +and the file system can sometimes transfer data into memory faster than the +processor can read the data. +For example, \s-1SGI\s0's XFS can move data into memory at +rates in excess of 500M per second, but it can move data into +the cache at only 68M per second. The intent is to measure performance +delivered to the application, not DMA performance to memory. +.TSTART +.so bw_reread2.tbl +.TEND "File vs. memory bandwidth (MB/s)" +.LP +The \*[mmap] interface provides a way to access the kernel's file cache +without copying the data. +The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) +into the +process's address space. The file is then summed to force the data +into the cache. 
+.LP +In Table \n[TABLE], +a good system will have \fIFile read\fP as fast as (or even faster than) +\fILibc bcopy\fP because as the file system overhead goes to zero, the +file reread case is virtually the same as the library \*[bcopy] case. +However, file reread can be faster because the kernel may have access to +\*[bcopy] assist hardware not available to the C library. +Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP +performance, but \*[mmap] is often dramatically worse. +Judging by the results, this looks to be a +potential area for operating system improvements. +.LP +In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes +full advantage of the memory subsystem from inside the kernel. +The mmap reread is probably slower because of the lower clock rate; +the page faults start to show up as a significant cost. +.LP +It is surprising that the Sun Ultra1 was able to bcopy at the high +rates shown in Table 2 but did not show those rates for file reread +in Table \n[TABLE]. +HP has the opposite problem, they get file reread faster than bcopy, +perhaps because the kernel \*[bcopy] has access to hardware support. +.LP +The Unixware system has outstanding mmap reread rates, better than +systems of substantially higher cost. Linux needs to do some work on +the \f(CWmmap\fP code. +.NH 2 +Latency +.LP +Latency is an often-overlooked +area of performance problems, possibly because resolving latency issues +is frequently much harder than resolving bandwidth issues. For example, +memory bandwidth may be increased by making wider cache lines and increasing +memory ``width'' and interleave, +but memory latency can be improved only by shortening paths or increasing +(successful) prefetching. +The first step toward improving latency is understanding the +current latencies in a system. 
+.LP +The latency measurements included in this suite are +memory latency, +basic operating system entry cost, +signal handling cost, +process creation times, +context switching, +interprocess communication, +.\" virtual memory system latency, +file system latency, +and disk latency. +.NH 2 +Memory read latency background +.LP +In this section, we expend considerable effort to define the different memory +latencies and to explain and justify our benchmark. +The background is a bit tedious but important, since we believe the +memory +latency measurements to be one of the most thought-provoking and useful +measurements in \*[lmbench]. +.LP +The most basic latency measurement is memory latency since most of +the other latency measurements can be expressed in terms of memory +latency. For example, context switches require saving the current +process state and loading the state of the next process. However, memory +latency is rarely accurately measured and frequently misunderstood. +.LP +Memory read latency has many definitions; +the most common, +in increasing time order, +are memory chip cycle time, processor-pins-to-memory-and-back time, +load-in-a-vacuum time, and back-to-back-load time. +.BU "Memory chip cycle latency" : +Memory chips are rated in nanoseconds; typical speeds are around 60ns. +A general overview on DRAM architecture may be found in +.RN Hennessy96 . +The +specific information we describe here is from +.RN Toshiba94 +and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0 +\s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the +time from +.ps -1 +.nr width \w'R\&A\&S' +.nr height \n[rst]+1000 +RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +assertion to the when +the data will be available on the \s-1DRAM\s0 pins (assuming +.ps -1 +.nr width \w'C\&A\&S' +.nr height \n[rst]+1000 +CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +access time requirements were met). 
+While it is possible +to get data out of a \s-1DRAM\s0 in 60ns, that is not all of +the time involved. There is a precharge time that must occur after +every access. +.RN Toshiba94 +quotes 110ns as the random read or write cycle time and this +time is more representative of the cycle time. +.\" For example, most systems offer a wide range of memory +.\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number +.\" of simms range from 1 to 16. The more simms there are, the more +.\" capacitance there is in the memory subsystem. More capacitance means +.\" longer setup times for the fully populated memory subsystem. System +.\" designers have to allow time for this setup. +.\" For more details, consult [XXX - reference on DRAM]. +.\" This is sometimes referred to as the chip latency. The +.\" chip cycle time is the chip latency plus the time required to restore +.\" the data in the capacitors which is often referred to as the precharge +.\" time. This means that 60 nanosecond memory chips really are more like +.\" 100 nanosecond memory chips. Some systems operate memory in ``page +.\" mode'' or ``static column'' memory systems hold either RAS or CAS and +.\" allow subsequent accesses in the same row or column in one cycle instead +.\" of two. +.BU "Pin-to-pin latency" : +This number represents the time needed +for the memory request to travel from the processor's pins to the memory +subsystem and back again. Many vendors have used the pin-to-pin +definition of memory latency in their reports. For example, +.RN Fenwick95 +while describing the \s-1DEC\s0 8400 +quotes memory latencies of 265ns; a careful +reading of that paper shows that these are pin-to-pin numbers. In spite +of the historical precedent in vendor reports, this definition of memory +latency is misleading since it ignores actual delays seen when a load +instruction is immediately followed by a use of the data being loaded. 
+The number of additional cycles inside the processor can be significant
+and grows more significant with today's highly pipelined architectures.
+.LP
+It is worth noting that the pin-to-pin numbers
+include the amount of time it takes to charge
+the lines going to the \s-1SIMM\s0s, a time that increases with the
+(potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
+more capacitance which results in longer charge times. This is one reason
+why personal computers frequently have better memory latencies than
+workstations: the PCs typically have less memory capacity.
+.BU "Load-in-a-vacuum latency" :
+A load in a vacuum is the time that the processor will wait for one load that
+must be fetched from main memory (i.e., a cache miss). The ``vacuum''
+means that there is no other activity on the system bus, including no other
+loads.
+While this number is frequently used as the memory latency, it is not very
+useful. It is basically a ``not to exceed'' number important only for
+marketing reasons.
+Some architects point out that since most processors implement nonblocking
+loads (the load does not cause a stall until the data is used), the perceived
+load latency may be much less than the real latency. When pressed, however,
+most will admit that cache misses occur in bursts, resulting in perceived
+latencies of at least the load-in-a-vacuum latency.
+.BU "Back-to-back-load latency" :
+Back-to-back-load latency is the time that each load takes, assuming
+that the instructions before and after are also cache-missing loads.
+Back-to-back loads may take longer than loads in a vacuum for the
+following reason: many systems implement something known as \fIcritical
+word first\fP, which means that the subblock of the cache line that
+contains the word being loaded is delivered to the processor before the
+entire cache line has been brought into the cache.
If another load +occurs quickly enough after the processor gets restarted from the +current load, the second load may stall because the cache is still +busy filling the cache line for the previous load. On some systems, +such as the current implementation of UltraSPARC, +the difference between back to back and load in a vacuum is about 35%. +.LP +\*[lmbench] measures back-to-back-load latency because it is the +only measurement that may be easily measured from software and +because we feel that it is what most software developers consider to be memory +latency. Consider the following C code fragment: +.DS +.nf +.ft CW +p = head; +while (p->p_next) + p = p->p_next; +.ft +.fi +.DE +On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the +load. A 300 Mhz processor has a 3.33ns cycle time, so the loop +could execute in slightly less than 10ns. However, the load itself +takes 400ns on a 300 Mhz \s-1DEC\s0 8400. In other words, the +instructions cost 10ns but the load stalls for 400. Another +way to look at it is that 400/3.3, or 121, nondependent, +nonloading instructions following the load would be needed +to hide the load latency. +Because superscalar processors typically execute multiple operations +per clock cycle, they need even more useful operations between cache +misses to keep the processor from stalling. +.LP +This benchmark illuminates the tradeoffs in processor cache design. +Architects like large cache lines, up to 64 bytes or so, because +the prefetch effect of gathering a whole line increases +hit rate given reasonable spatial locality. +Small stride sizes have high spatial locality and should have higher +performance, but large stride sizes have poor spatial locality causing +the system to prefetch useless data. 
+So the benchmark provides the following insight into negative +effects of large line prefetch: +.BU +Multi-cycle fill operations are typically atomic events at the +caches, and sometimes block other cache accesses until they +complete. +.BU +Caches are typically single-ported. Having a large line prefetch +of unused data causes extra bandwidth +demands at the cache, and can cause increased access latency for +normal cache accesses. +.LP +In summary, we believe that processors are so fast that the average +load latency for cache misses will be closer to the +back-to-back-load number than to the load-in-a-vacuum number. We are +hopeful that the industry will standardize on this definition of +memory latency. +.NH 2 +Memory read latency +.LP +The entire memory hierarchy can be measured, including on-board data +cache latency and size, external data cache latency and size, and +main memory latency. +Instruction caches are not measured. +TLB miss latency can also be measured, as in +.RN Saavedra92 , +but we stopped at main memory. Measuring TLB miss time is problematic +because different systems map different amounts of memory with their +TLB hardware. +.LP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. Then the list is walked thus: +.DS +.ft CW +mov r4,(r4) # C code: p = *p; +.ft +.DE +The time to do about 1,000,000 loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction in one processor cycle +(not counting stalls). 
In other words, if the processor cache load time +is 60ns on a 20ns processor, the load latency reported +would be 40ns, the additional 20ns is for the load instruction +itself.\** +.FS +In retrospect, this was a bad idea because we calculate the clock +rate to get the instruction execution time. If the clock rate is off, +so is the load time. +.FE +Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (we don't know of any processors that do that). +.LP +This benchmark has been validated by logic analyzer measurements +on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer +Research Center. +.TSTART 1 +.so mem.pic +.FEND "Memory latency" 1 +.LP +Results from the memory latency benchmark are plotted as a series of data sets +as shown in Figure \n[FIGURE]. +Each data set represents a stride size, +with the array size varying from 512 bytes up to 8M or more. +The curves contain a series of +horizontal plateaus, where each plateau represents a level in the +memory hierarchy. +The point where each plateau ends and the line rises marks the +end of that portion of the memory hierarchy (e.g., external cache). +Most machines have similar memory hierarchies: +on-board cache, external cache, main memory, and main memory plus TLB +miss costs. +There are variations: some processors are missing a cache, while +others add another cache to the hierarchy. +.\" XXX Larry please double-check this; I am going on dim memory... +For example, the Alpha 8400 has two on-board caches, one 8K +and the other 96K. +.LP +The cache line size can be derived by comparing curves and noticing which +strides are faster than main memory times. The smallest stride that is +the same as main memory speed is likely to be the cache line size because +the strides that are faster than memory are +getting more than one hit per cache line. 
+.\" Prefetching may confuse +.\" the issue because a demand read may stall behind a prefetch load, +.\" causing cache lines to appear twice as large as they are. +.\" XXX +.\" Larry --- can we use prime modulus arithmetic to set up pointer +.\" loops which might appear random but which really aren't and which +.\" hit every stride once before looping? +.\" +.\" XXX +.\" Larry --- is there any way we can defeat/disable prefetching +.\" so the cache line size can be more accurately determined? +.\" +.\" XXX +.\" Larry --- can we create a benchmark for TLB misses? +.\" I think it was Tom Rokicki who suggested that we create a +.\" benchmark where the data fits in the cache, but the pages don't +.\" fit in the TLB. +.\" +.\" XXX +.\" Larry --- is the description of the memory hierarchy correct? +.\" I am not sure I haven't added an extra level of external cache... +.EQ +delim $$ +.EN +.LP +Figure \n[FIGURE] shows memory latencies on a nicely made machine, +a \s-1DEC\s0 Alpha. +We use this machine as the example +because it shows the latencies and sizes of +the on-chip level 1 and motherboard level 2 caches, and because it +has good all-around numbers, especially considering it can support a +4M level 2 cache. +The on-board cache is $2 sup 13$ bytes or 8K, while the +external cache is $2 sup 19$ bytes or 512K. +.EQ +delim off +.EN +.TSTART +.so lat_allmem.tbl +.TEND "Cache and memory latency (ns)" +.nr MEMTABLE \n[TABLE] +.LP +Table \n[TABLE] shows the cache size, cache latency, and main memory +latency as extracted from the memory latency graphs. +The graphs and the tools for extracting the data are +included with \*[lmbench]. +It is worthwhile to plot all of the graphs and examine them since the +table is missing some details, such as the +\s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache. +.LP +We sorted Table \n[TABLE] on level 2 cache latency because we think +that many applications will fit in the level 2 cache. 
The HP and IBM
+systems have only one level of cache so we count that as both level 1
+and level 2. Those two systems have remarkable cache performance for
+caches of that size. In both cases, the cache delivers data in one
+clock cycle after the load instruction.
+.LP
+HP systems usually focus on
+large caches as close as possible to the processor. An older HP
+multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
+cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
+primarily a database server.
+.FS
+The Usenix version of this paper had this as a set associative cache; that was
+incorrect.
+.FE
+.LP
+The IBM focus is on low latency, high
+bandwidth memory. The IBM memory subsystem is good because all of
+memory is close to the processor, but has the weakness that it is
+extremely difficult to evolve the design to a multiprocessor system.
+.LP
+The 586 and PowerPC motherboards have quite poor second level caches;
+the caches are not substantially better than main memory.
+.LP
+The Pentium Pro and Sun Ultra second level caches are of medium speed
+at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared
+against the HP and IBM one cycle latency caches of similar size.
+Given the tight integration of the Pentium Pro level 2 cache, it is
+surprising that it has such high latencies.
+.LP
+The 300Mhz DEC Alpha has a rather high 22 clock latency to the second
+level cache which is probably one of the reasons that they needed a 96K
+level 1.5 cache. SGI and DEC have used large second level caches
+to hide their long latency from main memory.
+.LP
+.NH 2
+Operating system entry
+.LP
+Entry into the operating system is required for many system facilities.
+When calculating the cost of a facility, it is useful to know how
+expensive it is to perform a nontrivial entry into the operating system.
+.LP
+We measure nontrivial entry into the system by repeatedly writing one
+word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but
+discard the data. This particular entry point was chosen because it has
+never been optimized in any system that we have measured. Other entry
+points, typically \*[getpid] and \*[gettimeofday], are heavily used,
+heavily optimized, and sometimes implemented as user-level library
+routines rather than system calls.
+A write to the \f(CW/dev/null\fP driver will go
+through the system call table to \*[write], verify the user area as
+readable, look up the file descriptor to get the vnode, call the vnode's
+write function, and then return.
+.TSTART
+.so lat_nullsys.tbl
+.TEND "Simple system call time (microseconds)"
+.LP
+Linux is the clear winner in the system call time. The reasons are
+twofold: Linux is a uniprocessor operating system, without any
+MP overhead, and Linux is a small operating system, without all
+of the ``features'' accumulated by the commercial offerings.
+.LP
+Unixware and Solaris are doing quite well, given that they are both fairly
+large, commercially oriented operating systems with a large accumulation
+of ``features.''
+.NH 2
+Signal handling cost
+.LP
+Signals in Unix are a way to tell another process to handle an event. They
+are to processes as interrupts are to the CPU.
+.LP
+Signal handling is often critical to layered systems. Some applications,
+such as databases, software development environments, and threading libraries
+provide an operating system-like layer on top of the operating system,
+making signal handling a critical path in many of these applications.
+.LP
+\*[lmbench] measures both signal installation and signal dispatching in two separate
+loops, within the context of one process.
+It measures signal handling by installing a signal handler and then repeatedly
+sending itself the signal.
+.TSTART +.so lat_signal.tbl +.TEND "Signal times (microseconds)" +.LP +Table \n[TABLE] shows the signal handling costs. +Note that there are no context switches in this benchmark; the signal goes +to the same process that generated the signal. In real applications, +the signals usually go to another process, which implies +that the true cost of sending that signal is the signal overhead plus the +context switch overhead. We wanted to measure signal and context +switch overheads separately since context +switch times vary widely among operating systems. +.LP +SGI does very well on signal processing, +especially since their hardware is of an older generation than +many of the others. +.LP +The Linux/Alpha signal handling numbers are so poor +that we suspect that this is a bug, especially given that the Linux/x86 +numbers are quite reasonable. +.NH 2 +Process creation costs +.LP +Process benchmarks are used to measure the basic process primitives, +such as creating a new process, running a different program, and context +switching. Process creation benchmarks are of particular interest +in distributed systems since many remote operations include the creation +of a remote process to shepherd the remote operation to completion. +Context switching is important for the same reasons. +.BU "Simple process creation" . +The Unix process creation primitive is \*[fork], which +creates a (virtually) exact copy of the calling process. +Unlike VMS and some other operating systems, Unix starts any new process +with a \*[fork]. +Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and +``light,'' facts that many have been ignoring for some time. +.LP +\*[lmbench] measures simple process creation by creating a process +and immediately +exiting the child process. The parent process waits for the child +process to exit. +The benchmark is intended to measure the overhead for creating a +new thread of control, so it includes the \*[fork] and +the \*[exit] time. 
+.LP
+The benchmark also includes a \f(CWwait\fP system call in the parent and
+context switches from the parent to the child and back again. Given that
+context switches of this sort are on the order of 20 microseconds and a
+system call is on the order of 5 microseconds, and that the entire benchmark
+time is on the order of a millisecond or more, the extra overhead
+is insignificant.
+Note that even this relatively simple task is very expensive and is
+measured in milliseconds while most of the other operations we consider are
+measured in microseconds.
+.BU "New process creation" .
+The preceding benchmark did not create a new application; it created a
+copy of the old application. This benchmark measures the cost of creating a
+new process and changing that process into a new application, which
+forms the basis of every Unix command
+line interface, or shell.
+\*[lmbench] measures this facility by forking a new child and having that child
+execute a new program \(em in this case, a tiny program that prints
+``hello world'' and exits.
+.LP
+The startup cost is especially noticeable
+on (some) systems that have shared libraries. Shared libraries can
+introduce a substantial (tens of milliseconds) startup cost.
+.\" XXX - statically linked example?
+.TSTART
+.so lat_allproc.tbl
+.TEND "Process creation time (milliseconds)"
+.BU "Complicated new process creation" .
+When programs start other programs, they frequently use one of
+three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first
+two interfaces start a new process by invoking the standard command
+interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way
+guarantees that the shell will look for the requested application
+in all of the places that the user would look \(em in other words, the shell
+uses the user's $PATH variable as a list of places to find the
+application.
\*[execlp] is a C library routine which also looks for the +program using the user's $PATH variable. +.LP +Since this is a common way of starting applications, we felt it +was useful to show the costs of the generality. +.LP +We measure this by starting \f(CW/bin/sh\fP to start the same tiny +program we ran in the last case. +In Table \n[TABLE] the cost of asking the shell to go +look for the program is +quite large, frequently ten times as expensive as just creating a +new process, and four times as expensive as explicitly naming the location +of the new program. +.LP +The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. +Given that the processor is one of the fastest, the problem is likely to be +software. There is room for substantial improvement in the Solaris +process creation code. +.NH 2 +Context switching +.LP +Context switch time is defined here as +the time needed to save the state of one process and restore the state +of another process. +.LP +Context switches are frequently in the critical performance path of +distributed applications. For example, the multiprocessor versions +of the IRIX operating system use +processes to move data through the networking stack. This means that the +processing time for each new packet arriving at an idle system includes +the time needed to switch in the networking process. +.LP +Typical context switch benchmarks measure just the minimal context switch +time \(em the time to switch between two processes that are doing nothing +but context switching. We feel that this is +misleading because there are frequently more than two active processes, +and they usually have a larger working set (cache footprint) +than the benchmark processes. +.LP +Other benchmarks frequently include the cost of +the system calls needed to force the context switches. +For example, Ousterhout's context switch benchmark +measures context switch time plus a \*[read] and a \*[write] +on a pipe. 
+In many of the systems measured by \*[lmbench], the pipe overhead +varies between 30% and 300% of the context switch time, so we were +careful to factor out the pipe overhead. +.BU "Number of processes." +The context switch benchmark is implemented as +a ring of two to twenty processes that are connected with Unix pipes. +A token is passed from process to process, forcing context switches. +The benchmark measures the time needed to pass +the token two thousand times from process to process. +Each transfer of the token has two costs: the context switch, and +the overhead of passing the token. +In order to calculate just the context switching time, the benchmark first +measures the cost of passing the token through a ring of pipes in a +single process. This overhead time is defined as the cost of passing +the token and is not included in the reported context switch time. +.BU "Size of processes." +In order to measure more realistic context switch times, we add +an artificial variable size ``cache footprint'' to the switching +processes. The cost of the context switch then includes the cost +of restoring user-level state (cache footprint). The cache footprint +is implemented by having the process allocate an array of data\** +.FS +All arrays are at the same virtual +address in all processes. +.FE +and sum +the array as a series of integers after receiving the token but before +passing the token to the next process. Since most systems will cache data +across context switches, the working set for the benchmark is slightly +larger than the number of processes times the array size. +.LP +It is worthwhile to point out that the overhead mentioned above +also includes the cost of accessing the data, in the same way as +the actual benchmark. However, because the overhead is measured +in a single process, the cost is typically the cost with ``hot'' +caches. 
In Figure 2, each size is plotted as a line, with
+context switch times on the Y axis, number of processes on the
+X axis, and the process size as the data set.
+The process size and the hot cache overhead costs for
+the pipe read/writes and any data access is what is labeled
+as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead
+is in microseconds.
+.LP
+The context switch time does not include anything other than
+the context switch, provided that all the benchmark processes fit in the
+cache. If the total size of all of the benchmark processes is larger
+than the cache size, the cost of each context switch will include cache
+misses.
+We are trying to show realistic context switch times as a
+function of both size and number of processes.
+.TSTART 1
+.so ctx.pic
+.FEND "Context switch times" 1
+.LP
+Results for an Intel Pentium Pro system running Linux at 167 MHz are
+shown in Figure \n[FIGURE].
+The data points on the figure are labeled with the working set
+due to the sum of data in all of the processes. The actual working set is
+larger, as it includes the process and kernel overhead as well.
+One would expect the context switch times to stay constant until
+the working set is
+approximately the size of the second level cache. The Intel system has a
+256K second level cache, and the context switch times
+stay almost constant until about 256K (marked as .25M in the graph).
+.BU "Cache issues" .
+The context switch benchmark is a deliberate measurement of the
+effectiveness of the caches across process context switches. If the
+cache does not include the process identifier (PID, also sometimes
+called an address space identifier) as part of the address, then the
+cache must be flushed on every context switch. If the cache does not map
+the same virtual addresses from different processes to different cache
+lines, then the cache will appear to be flushed on every context
+switch.
+.LP +If the caches do +not cache across context switches there would be no grouping at the +lower left corner of Figure \n[FIGURE], instead, the graph would +appear as a series of straight, horizontal, parallel lines. The number +of processes will not matter, the two process case will be just as bad +as the twenty process case since the cache would not be +useful across context switches. +.TSTART +.so ctx.tbl +.TEND "Context switch time (microseconds)" +.LP +We picked four points on the graph and extracted those values for Table +\n[TABLE]. The complete set of values, as well as tools to graph them, +are included with \*[lmbench]. +.LP +Note that multiprocessor context switch times are frequently more expensive +than uniprocessor context switch times. This is because multiprocessor +operating systems tend to have very complicated scheduling code. +We believe that multiprocessor context switch times can be, and should be, +within 10% of the uniprocessor times. +.LP +Linux does quite well on context switching, especially on the more +recent architectures. By comparing the Linux 2 0K processes to the +Linux 2 32K processes, it is apparent that there is something wrong +with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can +find at least part of the cause. The second level cache latency for the +i586 is substantially worse than either the i686 or the Alpha. +.LP +Given the poor second level cache behavior of the PowerPC, it is surprising +that it does so well on context switches, especially the larger sized cases. +.LP +The Sun Ultra1 context switches quite well in part because of enhancements +to the register window handling in SPARC V9. +.NH 2 +Interprocess communication latencies +.LP +Interprocess communication latency is important because many operations +are control messages to another process (frequently on another +system). 
The time to tell the remote process to +do something is pure overhead and is frequently in the critical path +of important functions such as distributed applications (e.g., +databases, network servers). +.LP +The interprocess communication latency benchmarks typically have the +following form: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds needed +to do one round trip. For one way timing, +about half the round trip is right. However, the CPU cycles tend to be +somewhat asymmetric for one trip: receiving is typically more +expensive than sending. +.BU "Pipe latency" . +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +.LP +Pipes are frequently used as a local IPC mechanism. Because of the +simplicity of pipes, they are frequently the fastest portable +communication mechanism. +.LP +Pipe latency is measured by creating a pair of pipes, forking a child process, +and passing a word back and forth. This benchmark is identical to the +two-process, zero-sized context switch benchmark, except that it includes +both the context switching time and the pipe overhead in the results. +.nr NTABLE \n[TABLE]+1 +.nr LTABLE \n[TABLE] +Table \n[NTABLE] shows the round trip latency from process A to process B +and back to process A. +.TSTART +.so lat_pipe.tbl +.TEND "Pipe latency (microseconds)" +.LP +The time can be broken down to two context switches plus four system calls +plus the pipe overhead. The context switch component is two of the small +processes in Table \n[LTABLE]. +This benchmark is identical to the context switch benchmark in +.RN Ousterhout90 . +.BU "TCP and RPC/TCP latency" . 
+TCP sockets may be viewed as an interprocess communication mechanism similar +to pipes with the added feature that TCP sockets work across machine +boundaries. +.LP +TCP and RPC/TCP connections are frequently used in low-bandwidth, +latency-sensitive applications. The default Oracle distributed +lock manager uses TCP sockets, and the locks per second available +from this service are accurately modeled by the TCP latency test. +.TSTART +.so lat_tcp.tbl +.TEND "TCP latency (microseconds)" +.LP +Sun's RPC is layered either over TCP or over UDP. +The RPC layer is responsible for managing connections (the port mapper), +managing different byte orders and word sizes (XDR), and implementing a +remote procedure call abstraction. +Table \n[TABLE] shows the same benchmark with and +without the RPC layer to show the cost of the RPC implementation. +.LP +TCP latency is measured by having a server process that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is one +round-trip time. The measurements in Table \n[TABLE] are local +or loopback measurements, +since our intent is to show the overhead of the software. The same benchmark +may be, and frequently is, used to measure host-to-host latency. +.LP +Note that the RPC layer frequently adds hundreds of microseconds of +additional latency. The problem is not the external data +representation (XDR) layer \(em the +data being passed back and forth is a byte, so there is no XDR to be done. +There is no justification for the extra cost; it is simply +an expensive implementation. DCE RPC is worse. +.TSTART +.so lat_udp.tbl +.TEND "UDP latency (microseconds)" +.BU "UDP and RPC/UDP latency" . +UDP sockets are an alternative to TCP sockets. They differ in that UDP +sockets are unreliable messages that leave the retransmission issues to +the application. UDP sockets have a few advantages, however. 
They preserve +message boundaries, whereas TCP does not; and a single UDP socket may +send messages +to any number of other sockets, whereas TCP sends data to only one place. +.LP +UDP and RPC/UDP messages are commonly used in many client/server applications. +NFS is probably the most widely used RPC/UDP application in the world. +.LP +Like TCP latency, UDP latency is measured by having a server process +that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is round-trip +time. The measurements in Table \n[TABLE] are local or loopback measurements, +since our intent is to show the overhead of the software. +Again, note that the RPC library can add hundreds of microseconds of extra +latency. +.\" .LP +.\" It is interesting to compare UDP latency with TCP latency. In many cases the +.\" TCP latency is \fBless\fP than the UDP latency. This flies in the face +.\" of conventional wisdom, which says that TCP is an inherently more expensive +.\" protocol than UDP. The reasons that TCP may appear faster are: in this +.\" benchmark, the protocol costs are dwarfed by the other costs (context +.\" switching, system calls, and driver overhead); and TCP is frequently +.\" hand-tuned for performance, while UDP is rarely hand-tuned. +.TSTART +.so lat_ipc.tbl +.TEND "Remote latencies (microseconds)" +.BU "Network latency" . +We have a few results for over the wire latency included in Table \n[TABLE]. +As might be expected, the most heavily used network interfaces (i.e., ethernet) +have the lowest latencies. The times shown include the time on the wire, +which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit +ethernet and FDDI, and less than 10 microseconds for Hippi. +.BU "TCP connection latency" . +TCP is a connection-based, reliable, byte-stream-oriented protocol. 
As +part of this reliability, a connection must be established before any +data can be transferred. The connection is accomplished by a ``three-way +handshake,'' an exchange of packets when the client attempts to connect +to the server. +.LP +Unlike UDP, where no connection is established, TCP sends packets +at startup time. If an application creates a TCP connection to send +one message, then the startup time can be a substantial +fraction of the total connection and transfer costs. +The benchmark shows that the connection cost is approximately half of +the cost. +.LP +Connection cost is measured by having a server, registered using +the port mapper, waiting for connections. The client figures out where the +server is registered and then repeatedly times a \*[connect] system call to +the server. The socket is closed after each connect. Twenty connects +are completed and the fastest of them is used as the result. The time measured +will include two of the three packets that make up the three way TCP handshake, +so the cost is actually greater than the times listed. +.\" XXX Larry --- if a machine's clock granularity is on the order of +.\" 10 milliseconds, won't this benchmark run into granularity problems? +.TSTART +.so lat_connect.tbl +.TEND "TCP connect latency (microseconds)" +.LP +Table \n[TABLE] shows that if the need is to send +a quick message to another process, given that most packets get through, +a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive +acknowledgments are needed, which they are in order to have an apples-to-apples +comparison with TCP). If the transmission medium is 10Mbit Ethernet, the +time on the wire will be approximately 65 microseconds each way, or 130 +microseconds total. To do the same thing with a short-lived TCP +connection would cost 896 microseconds of wire time alone. +.LP +The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor +is the point to suggest that all messages should be UDP. 
In many cases, +the difference between 130 microseconds and 900 microseconds is +insignificant compared with other aspects of application performance. +However, if the application is very latency sensitive +and the transmission medium is slow (such as serial link or a message +through many routers), then a UDP message may prove cheaper. +.NH 2 +File system latency +.LP +File system latency is defined as the time required to create or delete +a zero length file. +We define it this way because in many file systems, +such as the BSD fast file system, the directory operations are done +synchronously in order to maintain on-disk integrity. Since the +file data is typically cached and sent to disk at some later date, +the file creation and deletion become the bottleneck +seen by an application. This bottleneck is substantial: to do +a synchronous update to a disk is a matter of tens of milliseconds. +In many cases, this bottleneck is much more of a perceived performance +issue than processor speed. +.LP +The benchmark creates 1,000 zero-sized files and then deletes them. +All the files are created in one directory and their names are +short, such as "a", "b", "c", ... "aa", "ab", .... +.TSTART +.so lat_fs.tbl +.TEND "File system latency (microseconds)" +.LP +The create and delete latencies are shown in Table \n[TABLE]. +Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster +than the slowest systems. However, Linux does not guarantee +anything about the disk integrity; the directory operations are done in +memory. Other fast systems, such as SGI's XFS, use a log to guarantee the +file system integrity. +The slower systems, all those with ~10 millisecond file latencies, are +using synchronous writes to guarantee the file system integrity. +Unless Unixware has modified UFS substantially, they must be running in +an unsafe mode since the FreeBSD UFS is much slower and both file +systems are basically the 4BSD fast file system. 
+.NH 2 +Disk latency +.\" XXX - either get more results for this benchmark or delete it. +.\" I'd really like to not delete it - lmdd is probably the most +.\" useful tool and it gets the least press. +.LP +Included with \*[lmbench] is a small benchmarking program useful for +measuring disk and file I/O. \*[lmdd], which is patterned after +the Unix utility \f(CWdd\fP, measures both sequential and random I/O, +optionally generates patterns on output and checks them on input, +supports flushing the data from the buffer cache on systems that +support \f(CWmsync\fP, and has a very flexible user interface. +Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script +wrapped around \*[lmdd]. +.LP +While we could have generated both sequential and random I/O results as +part of this paper, we did not because those benchmarks are heavily +influenced by the performance of the disk drives used in the test. We +intentionally measure only the system overhead of a SCSI command since +that overhead may become a bottleneck in large database configurations. +.LP +Some important applications, such as transaction processing, are +limited by random disk IO latency. +Administrators can increase the number of disk operations per +second by buying more disks, until the processor overhead becomes +the bottleneck. +The \*[lmdd] benchmark measures the processor overhead associated with each +disk operation, and it can provide an upper bound on the number of +disk operations the processor can support. +It is designed for SCSI disks, and it assumes that most +disks have 32-128K read-ahead buffers and that they can read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true: a processor could be fast enough to make the +requests faster than the rotating disk. +If we take 6M/second to be disk +speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or +81 microseconds/IO. 
We don't know of any processor/OS/IO controller
+combinations that can do an IO in 81 microseconds.
+.FE
+.LP
+The benchmark simulates a large number of disks by reading 512-byte
+transfers sequentially from the raw disk device (raw disks are unbuffered
+and are not read ahead by Unix).
+Since the disk can read ahead faster than the system can request
+data, the benchmark is doing small transfers of data from the
+disk's track buffer.
+Another way to look at this is that the benchmark
+is doing memory-to-memory transfers across a SCSI channel.
+It is possible to generate loads of more than 1,000 SCSI
+operations/second on a single SCSI disk. For comparison, disks under
+database load typically run at 20-80 operations per second.
+.TSTART
+.so lat_disk.tbl
+.TEND "SCSI I/O overhead (microseconds)"
+.LP
+The resulting overhead number represents a
+\fBlower\fP bound on the overhead of a disk I/O.
+The real overhead numbers will be higher on SCSI systems because
+most SCSI controllers will not disconnect if the request can be
+satisfied immediately.
+During the benchmark, the processor simply sends the request and
+transfers the data, while
+during normal operation, the processor will send the request,
+disconnect, get interrupted, reconnect, and transfer the data.
+.LP
+This technique can be used to discover how many drives a system can support
+before the system becomes CPU-limited because it can produce the
+overhead load of a fully configured system with just a few disks.
+.NH 2
+Parallelism
+.LP
+description of parallelism benchmarks with sample results.
+.NH 2
+Other benchmarks
+.LP
+description of other benchmarks with sample results.
+.NH 1
+Scaling Benchmarks
+.LP
+There are a number of issues associated with converting
+single-process benchmarks to
+scalable benchmarks with several independent processes,
+in addition to the various issues addressed by
+the timing harness.
+Many of the benchmarks consume or utilize system +resources, such as memory or network bandwidth, +and a careful assessment of the likely resource +contention issues is necessary to ensure that the +benchmarks measure important aspects of system performance +and not artifacts of artificial resource contention. +.LP +For example, the Linux 2.2 kernel uses a single lock to +control access to the kernel data structures for a file. +This means that multiple processes accessing that file +will have their operations serialized by that lock. +.NH 2 +File System +.LP +A number of the benchmarks measure aspects of file system +performance, such as \*[bw_file_rd], \*[bw_mmap_rd], +\*[lat_mmap], and \*[lat_pagefault]. +It is not immediately apparent how these benchmarks should +be extended to the parallel domain. For example, it may +be important to know how file system performance scales +when multiple processes are reading the same file, or +when multiple processes are reading different files. +The first case might be important for large, distributed +scientific calculations, while the second might be more +important for a web server. +.LP +However, for the operating system, the two cases are +significantly different. When multiple processes +access the same file, access to the kernel data +structures for that file must be coordinated and +so contention and locking of those structures can +impact performance, while this is less true when +multiple processes access different files. +.LP +In addition, there are any number of issues associated +with ensuring that the benchmarks are either measuring +operating system overhead (e.g., that no I/O is actually +done to disk), or actually measuring the system's I/O +performance (e.g., that the data cannot be resident in +the buffer cache). 
Especially with file system related +benchmarks, it is very easy to develop benchmarks that +compare apples and oranges (e.g., the benchmark includes +the time to flush data to disk on one system, but only +includes the time to flush a portion of data to disk on +another system). +.LP +\*[lmbench3] allows the user to measure either case +as controlled by a command-line switch. When measuring +accesses to independent files, the benchmarks first +create their own private copies of the file, one for +each child process. Then each process accesses its +private file. When measuring accesses to a single +file, each child simply uses the designated file +directly. +.NH 2 +Context Switching +.LP +Measuring context switching accurately is a difficult +task. \*[lmbench1] and \*[lmbench2] measured context +switch times via a "hot-potato" approach using pipes +connected in a ring. However, this experimental +design heavily favors schedulers that do "hand-off" +scheduling, since at most one process is active at +a time. +Consequently, it is not really a good benchmark +for measuring scheduler overhead in multi-processor +machines. +.LP +The design and methodology for measuring context +switching and scheduler overhead need to be revisited +so that it can more accurately measure performance +for multi-processor machines. +.NH 1 +New Benchmarks +.LP +\*[lmbench3] also includes a number of +new benchmarks. +.NH 2 +Stream +.LP +\*[lmbench3] includes a new micro-benchmark which +measures the performance of John McCalpin's \*[stream] +benchmark kernels for \*[stream] versions 1 and 2. +This benchmark faithfully recreates each of the +kernel operations from both \*[stream] benchmarks, +and because of the powerful new timing harness it +can easily measure memory system scalability. +.TSTART 1 +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . 
+Stream +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +COPY|$a[i]=b[i]$|8(+8)|8|0 +SCALE|$a[i]=q times b[i]$|8(+8)|8|1 +ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 +TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2 +.TE +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . +Stream2 +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +FILL|$a[i]=q$|0(+8)|8|0 +COPY|$a[i]=b[i]$|8(+8)|8|0 +DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2 +SUM|$sum=sum + a[i]$|8|0|1 +.TE +.TEND "Stream operations" +.LP +Table \n[TABLE] shows the four kernels for each version +of the \*[stream] benchmark. Note that the +.I read +columns include numbers in parenthesis, which +represent the average number of bytes read into +the cache as a result of the write to that +variable. Cache lines are almost invariably +bigger than a single double, and so when a +write miss occurs the cache will read the line +from memory and then modify the selected bytes. +Sometimes vector instructions such as SSE +and 3DNow can avoid this load by writing an +entire cache line at once. +.NH 2 +Basic operation latency +.LP +\*[lmbench3] includes a new micro-benchmark +which measures the latency for a variety of basic +operations, such as addition, multiplication, and +division of integer, float, and double operands. +To measure the basic operation latency we construct +a basic arithmetic statement containing the desired +operands and operations. This statement is repeated +one hundred times and these repetitions are then +embedded in a loop. +.TSTART +.TS +center box tab (&); +c c c +l & l & l . +Operand&Operation&Statement +_ +int&$bit$&r^=i;s^=r;r|=s; +&$add$&a+=b;b-=a; +&$mul$&r=(r*i)^r; +&$div$&r=(r/i)^r; +&$mod$&r=(r%i)^r; +_ +float&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +_ +double&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +.TE +.TEND "lat_ops statements" +.LP +Table \n[TABLE] shows the data type and expressions +used for each basic operation type. 
The variable +$i$ indicates the integer loop variable and generally +changes every ten or hundred evaluations of the +basic expression. All other variables are of +the basic type being measured, and aside from +being modified by the relevant expressions are +only initialized once at the beginning of the +benchmark routine. +.LP +Each statement has been designed to ensure that +the statement instances are \fIinterlocked\fR, +namely that the processor cannot begin processing +the next instance of the statement until it has +completed processing the previous instance. This +property is crucial to the correct measurement of +operation latency. +.LP +One important consideration in the design of +the statements was that they not be optimized +out of the loop by intelligent compilers. +Since the statements are repeated one hundred +times, the compiler has the option of evaluating +the sequence of one hundred repetitions of the +same statement, and sometimes it can find +optimizations that are not immediately +apparent. For example, the integer statement +$a=a+a;$ when repeated one hundred times in +a loop can be replaced with the single statement +$a=0;$ because the statement $a=a+a;$ is equivalent +to $a< < =1;$, and one hundred repetitions of that +statement is equivalent to $a< < =100;$, which for +32bit (or even 64bit) integers is equivalent to +$a=0;$. +.LP +It is relatively easy to identify floating +point statements that interlock, are not +optimized away, and that only use the operation +of interest. +It is much harder to identify integer statements +meeting the same criterion. All simple +integer bitwise operations can either be optimized +away, don't interlock, or use operations other +than one of interest. +We chose to add operations other than the +operation(s) of interest to the statements. +.LP +The integer $mul$, $div$, and $mod$ statements all +include an added $xor$ operation which prevents +(current) compilers from optimizing the statements +away. 
Since the $xor$ operation is generally +completed in a single clock tick, and since +we can measure the $xor$ operation latency +separately and subtract that overhead, we can +still measure the latencies of the other +operations of interest. +.LP +It is not possible to measure latency for 64bit +operations on 32bit machines because most +implementations allow operations on the upper +and lower bits to overlap. This means that +on most 32bit machines, the measured latency +would appear to be a non-integral multiple of +the basic clock cycle. For example, in the +$add$ statement, the system could first add +the two lower words. Then, in parallel it +could both add the two upper words (along with +the carry from the lower words), and compute +the $xor$ of the lower word. Finally, it +can overlap the $xor$ of the upper word +with the addition of the two lower words from +the next instantiation of the statement. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . +Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +mhz&&2.50&1.50&0.75 +int&$bit$&2.53&1.50&0.75 +&$add$&2.50&1.51&0.75 +&$mul$&14.52&6.07&3.03 +&$div$&109.40&58.52&30.86 +&$mod$&75.14&65.01&32.59 +_ +float&$add$&7.54&4.58&3.0 +&$mul$&7.50&7.50&3.0 +&$div$&45.00&35.26&13.21 +_ +double&$add$&7.52&4.53&3.01 +&$mul$&7.52&7.71&3.01 +&$div$&85.01&35.51&13.16 +.TE +.TEND "lat_ops results (ns)" +.LP +Table \n[TABLE] contains some sample results +for two processors. +It does contain one result which is slightly +surprising unless you are familiar with the +PA-RISC architecture: floating point multiply +and divide are faster than the corresponding +integer operations! This is because PA-RISC +does not contain integer MUL, DIV, or MOD +instructions and the optimizing compiler +converts the integers into floating point, +does the operations in the floating point +unit, and then converts the result back +to an integer. 
+.NH 2 +Basic operation parallelism +.LP +Instruction-level parallelism in commodity processors +has become commonplace in the last ten years. +Modern processors typically have more than one +operational unit that can be active during a +given clock cycle, such as an integer arithmetic +unit and a floating point unit. In addition, +processors may have more than a single instance +of a given type of operational unit, both of +which may be active at a given time. All this +intra-processor parallelism is used to try and +reduce the average number of clock cycles per +executed instruction. +.LP +\*[lmbench3] incorporates a new benchmark \*[par_ops] +which attempts to quantify the level of available +instruction-level parallelism provided by the processor. This +benchmark is very similar to \*[lat_ops], and +in fact uses the same statement kernels, but it +has been modified and extended. We create +different versions of each benchmark; each +version has $N$ sets of interleaved statements. +Each set is identical to equivalent \*[lat_ops] +statements. In this way multiple independent +sets can be executing the same operation(s) +in parallel, if the hardware supports it. +.LP +For example, the float $mul$ benchmark to measure +performance with two parallel streams of statements +would look something like this: +.DS +#define TEN(a) a a a a a a a a a a +void benchmark_1(iter_t iterations, void* cookie) +{ + register iter_t i = iterations; + struct _state* state = (struct _state*)cookie; + register float f0 = state->float_data[0]; + register float f1 = state->float_data[1]; + + while (i-- > 0) { + TEN(f0*=f0; f1*=f1;) + } + use_int((int)f0); + use_int((int)f1); +} +.DE +.LP +If the processor had two floating point multiply +units, then both $f0$ and $f1$ multiplies could +proceed in parallel. +.LP +However, there are some potential problems with +the integer operations, namely the fact that the +statements contain mixed operations. 
In general, +processors have at least as many integer units +that can do $xor$ as can do the other operations +of interest ($mul$, $div$ and $mod$), so the +inclusion of $xor$ in the statements shouldn't +be a bottleneck. +.LP +However, since parallelism is measured by comparing +the latency of the single-stream with that of +multiple interleaved streams, and since the single-stream +latency includes the $xor$ latency, the apparent +parallelism of $mul$, $div$, $mod$ can be over-stated. +For example, if a process has one unit that can +do integer bit operations, such as $xor$, and another +unit for integer $mul$ operations, then the average +latency for $a0 = (i * a0) ^ a0$ in the single stream +case would be: +.EQ +t bar = t sub xor + t sub mul +.EN +In the multi-stream case, the execution of the $xor$ +operation of one stream can be overlapped with the +$mul$ of another stream, so the average latency per +stream would simply be $t bar = t sub mul$, assuming +that $mul$ operations are not cheaper than $xor$ +operations, which results in an apparent parallelism +$p tilde$: +.EQ +p tilde = {t sub xor + t sub mul} over { t sub mul } +.EN +Assuming that $t sub xor < < t sub mul$, this +still gives a reasonable approximation to +the correct answer. Unfortunately, this is +not always a reasonable assumption. +.LP +Of course, if it was known ahead of time that +$xor$ and { $mul$, $div$, and $mod$ } used +different execution units, then the benchmark +could simply subtract $t sub xor$ from the +baseline measurement. The difficulty lies +in determining whether the units overlap +or not. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . 
+Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +int&$bit$&1.99&1.70&1.87 +&$add$&1.99&1.61&1.90 +&$mul$&6.64&3.81&2.00 +&$div$&2.81&1.20&1.00 +&$mod$&2.78&1.11&1.03 +_ +float&$add$&5.88&1.00&2.66 +&$mul$&5.86&1.14&2.47 +&$div$&2.12&1.03&1.14 +_ +double&$add$&5.68&1.08&2.49 +&$mul$&5.58&1.00&2.53 +&$div$&2.19&1.03&1.14 +.TE +.TEND "par_ops results" +.LP +.NH 1 +Results +.LP +Some sample results +.LP +bw_mem_rd performance vs. scaling on an SMP machine +.LP + +.NH 1 +Unscalable benchmarks +.LP +There are a number of benchmarks which either +did not make sense for scalable load, such as +\*[mhz], or which could not +be extended to measure scalable load due to +other constraints, such as \*[lat_connect]. +.LP +\*[mhz] measures the processor clock speed, +which is not a scalable feature of the system, +so it doesn't make any sense to create a +version of it that measures scalable performance. +.LP +More specifically, \*[lat_connect] measures +the latency of connecting to a TCP socket. +TCP implementations have a timeout on +sockets and there is generally a fixed size +queue for sockets in the TIMEOUT state. +This means that once the queue has been +filled by a program connecting and closing +sockets as fast as possible, then all new +socket connections have to wait TIMEOUT +seconds. Needless to say, this gives no +insight into the latency of socket creation +per se, but is rather a boring artifact. +Since the \*[lmbench2] version of the +benchmark can run for very short periods +of time, it generally does not run into +this problem and is able to correctly +measure TCP connection latency. +.LP +Any scalable version of the benchmark needs +each copy to run for at least a second, and +there are $N$ copies creating connections as +fast as possible, so it would essentially be +guaranteed to run into the TIMEOUT problem. +Consequently, \*[lat_connect] was not +enhanced to measure scalable performance. 
+.NH 1 +A brief tutorial on memory design +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip. In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced. The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size.\** +.FS +See +.RN Hennessy96 +page 396. +.FE +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. 
+A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time. +This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. +.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. 
Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 1 +Memory analysis +.LP +There are a variety of aspects of memory hierarchy design +that are interesting to a software developer, such as +the number of caches and their sizes. In addition, other +aspects of cache design, such as the line size, +associativity and parallelism can impact software +performance and are of potential interest to software +developers. +.LP +The problem is designing a portable ANSI-C program to +infer the cache parameters. A number of operating +systems have hooks to report at least certain aspects +of cache and memory hierarchy design, but any program +utilizing those hooks would not be fully portable +across hardware and operating system platforms. +.LP +The key observation is that caches help reduce memory +latency. In a perfect world, all possible data would +fit in the cache, so a graph of average memory latency +versus amount of memory utilized would look like a +series of plateaus separated by cliffs. The cliff +edges would be located at the cache boundaries and +the plateau height would be the average memory latency. +.LP +The first problem is that one needs a mechanism for +accurately measuring time in a portable fashion. +\*[lmbench2] introduced a new timing harness +that determines the minimum duration of a timing interval +for \*[gettimeofday] to provide accurate measurements +.RN Staelin98 . 
+.LP +\*[lmbench] includes a benchmark that measures +average memory latency, \*[lat_mem_rd] +.RN McVoy96 . +It creates a pointer chain, and then measures the +average time to dereference the pointers. +\*[lat_mem_rd] creates the pointer chain by simply +striding through memory at fixed intervals, e.g. +every other word. +.LP +\*[lmbench2] extended \*[lat_mem_rd] so +that each timing interval only accessed memory +as many times as necessary to consume a timing +interval. When accessing cache this often means +that the whole pointer chain will be accessed +at least once during the timing interval, but +when accessing memory this often means that only +a portion of the chain will be accessed during +any given timing interval. +.LP +While this approach gives very useful insights +into memory hierarchy performance, it is not +quite sufficient to determine the various +characteristics of the memory hierarchy. +.LP +The first problem is that unless the stride is +exactly the same size as the cache size, then +there will either be multiple successive accesses +to the same line, or some fraction of data +will be completely skipped. In the first case +the observed latency is much faster than the +true latency because it is the average of a +single miss latency (slow) with one or more +hit latencies (fast). In the second case, the +amount of data actually loaded into the cache +may be a small fraction of the expected amount +so the data may fit into a smaller (faster) +cache. +The second problem is that this sequence is +highly predictable, even by simple-minded +prefetching policies, so accurate prefetching +might be masking the true memory latencies. +.LP +This method does do a few things properly. +First of all, accesses to a single page are +clustered together so the TLB miss cost (if +any) is amortized over as many accesses as +possible. 
Secondly, assuming the pointer +chain is laid out unpredictably, the memory +subsystem must wait for the previous load +to complete before it can initiate the +next load, so we can measure the true latency. +.NH 2 +Prefetching +.LP +Some memory subsystems have been highly optimized to +recognize and automatically prefetch memory when +given "predictable" memory access streams, such as +when striding through array accesses. This means that +the memory access stream generated by \*[lmbench] +must be unpredictable by the standard prediction +algorithms. +.LP +The original \*[lmbench] memory latency benchmark, +lat_mem_rd, built a chain of pointers that would +stride backwards through memory. This was able to +defeat many simple prefetching algorithms of the +time, but some systems came to incorporate prefetching +algorithms that recognized strided accesses in +both directions. +.LP +The obvious method for producing an unpredictable +chain of line references is to use a random +permutation of line indexes. +.LP +\*[lmbench] uses a deterministic algorithm to compute +the reference chain which guarantees that references +are as far away from previous accesses in both time +and space as possible. Basically, the binary bits +representing the line index are reversed, so that +1101 becomes 1011, or 001 becomes 100. This only +works if the number of cache lines is an even power +of two, but since page sizes and line sizes are +always powers of two, this assumption is valid.\** +.FS +At least this is the case in every modern system known +to the author. +.FE +.LP +Additionally, since higher-level caches can have +smaller line sizes than lower-level caches, it +is necessary to access every word in the relevant +chunk of memory. However, accesses to words in +the same line must be separated in time by accesses +to the rest of the memory. 
This is achieved by +identifying the line size for the largest cache, +and then setting up the chain so that there is +one pass through the memory for each word in the +line with the sequence of words being determined +by the bit-reversal method described above. +.LP +For example, suppose a system has 4KB pages, the +largest cache has a line size of 64bytes, and a +word is 4bytes. Then each page would have 64 lines, +and each line would have 16 words. The system +would set up a pointer chain that visits each line +on each page using the zeroth word; at the end of +the chain it would then jump to the start of the +pages and visit each line on each page using the +eighth word, and so forth until each word had been +visited. +.NH 2 +Dirty data +.LP +An additional issue that we need to take into +account is the cache's policy for dirty data. +Many caches use a copy-back policy, while others +use a write-through policy. +.LP +Different caches on the same machine may use +different policies. Also, cache performance +can be affected by the presence of dirty data. +For example, suppose both the L1 and L2 caches +use a copy-back policy, and suppose that the +access time for reading data located in L2 +depends on whether the data being ejected from +L1 is dirty and needs to be copied back from L1 +to L2 before the read from L2 to L1. +In this case, a benchmark which writes a pointer +chain that fits in L2 but is larger than L1, +and then measures the time to follow the chain, +will get a different average memory latency than +a benchmark which writes the same chain and +reads enough data to flush the L2 cache before +measuring the time to follow the chain. +In the first case, each application read will +result in a write from L1 to L2 followed by +a read from L2 to L1, while in the second +case each application read will only result +in a read from L2 to L1. 
+.LP +Since it is possible that average memory latencies +for a read-only access stream may be increased if +any of the data in the cache is dirty, we need to +flush the cache after setting up the pointer +chains and before we do any measurements. +Otherwise, when we access a pointer chain that +is larger than the L1 cache but smaller than the +largest cache, dirty data can reside in the lowest +(largest) cache and as each line is staged from +the largest cache to the L1 cache, it is marked +as dirty in the L1 cache. Then when each dirty +line is flushed from the L1 cache (to the L2 +cache), the system has to write the data back to +L2, which delays the load of the next (dirty) +line from L2 to L1. +.LP +To flush the cache we read (and sum) a large +amount of memory, which should be several times +larger than the largest cache. In this way, +all dirty data in the cache should be flushed +from the cache without creating additional +dirty data. +.NH 2 +Page mapping +.LP +Complicating the issue still further is the fact that +caches do not use full LRU replacement policies. Nearly +all caches use some form of set associativity, where +pages are directed to a pool of cache lines based on +the physical address. Replacement within the pool is +typically LRU. Direct-mapped caches are a special case +where the pool size is a single line. +.LP +Additionally, some systems use victim caches, which are +typically small caches which cache recently discarded +cache lines. Victim caches can be particularly effective +for direct-mapped caches by reducing the cache miss +rate caused by colliding hot spots. +.LP +However, page mapping and its attendant cache collisions +is under the control of the kernel, and is in fact +invisible to user-land programs. 
Some operating +systems make an effort to minimize possible page collisions +when giving memory to processes\**, while other operating +systems appear to simply grab the first available pages, +regardless of potential cache collision effects. +.FS +This is generally known as "page coloring", and is much +more important on systems with direct-mapped caches than +those with N-way set associative caches. +.FE +.LP +Factoring out page placement effects on average memory +latency is very difficult, but it is necessary to +ensure that the correct cache size is identified. +.NH 1 +Cache line size +.LP +The first feature of the memory hierarchy we +will try to analyze is the cache line size, +since we can find the line size for the +largest cache without any other knowledge of +the system, and since determining nearly all +other aspects of the memory subsystem either +requires or is greatly simplified by knowing +the cache line size. +.LP +The most obvious aspect of cache design is that replacement +is done on a per-line basis, and cache lines often contain +several words of data (32-128bytes per line is common). +However, it is necessary to ensure that we don't +generate "spurious" cache hits by referencing a word from +a cache line that was recently accessed. We must ensure +that each line is only re-referenced after all other +memory in the buffer has been referenced. +.LP +Unfortunately, we usually do not know the cache line size +ahead of time. In addition, sometimes systems contain +several caches, and each cache can use a different line +size! Usually line sizes are powers of two, and usually +the smaller (higher) caches have line sizes which are the +same or smaller than the larger (lower) caches. However, +we still need to ensure that we access all cache lines +for all caches without generating the spurious cache hits. +.LP +Determining the cache line size requires a series of +experiments. 
The basic observation is that when the +amount of memory being accessed is larger than the +cache, and when the access chain is arranged properly, +then each memory reference causes a cache miss. If, +however, a word on a recently accessed line is requested, +then that reference will be a cache hit. More +completely, the average memory access time $t bar$ +is: +.EQ +t bar = t sub miss + ( n - 1 ) t sub hit +.EN +expressed as a function of $n$, the number of accesses +to the cache line, $t sub miss$, the cache miss latency, +and $t sub hit$, the cache hit latency. +.TSTART +.G1 +.so memhier-line.d +.G2 +.FEND "Line Size" +.LP +We can determine the cache line size by measuring +the average memory access latency over a series of +memory access patterns: accessing every word, every +other word, every fourth word, every eighth word, ... +While the system is accessing multiple words per +cache line, the average memory latency will be +smaller than the cache miss latency, and as the +space between accesses increases, the average +memory latency will grow. +When the system accesses only one word per line, +the average memory latency will remain level even +as the spacing between accesses increases. +.LP +It is possible to utilize this behavior to identify +the cache line size. The algorithm is to measure +the average memory latency when each word is +accessed. Then as you increase the space between +accessed words (doubling the space each iteration), +you look for a situation where the average latency +increased dramatically, say greater than 30%, +followed by a levelling off on the next iteration, +say an increase less than 15%. The line size is +the last point where the average latency jumped +dramatically. +.NH 1 +TLB +.LP +Measuring the TLB-miss costs assumes that one can isolate +those costs from the rest of the memory access costs. 
The +key observation is that it is often possible to create a +situation in which all data being accessed resides in the +cache, and yet it requires a TLB-miss to be able to locate +it. +.LP +This program identifies the effective TLB size, rather +than the true TLB size. First of all, from a programmer's +point of view, it is really the effective TLB size that +impacts program performance. Secondly, there is no way +for a user-land program to measure true TLB size because +kernels sometimes pin some kernel page mappings into the +TLB and because some hardware/OS combinations +support "super-pages", or multi-page mappings. +.LP +We create two similar pointer chains with identical length +and which reference an identical amount of memory, with one +key difference. In the first chain, the data is packed +tightly into as few pages as possible, and references +remain within a single page as long as possible. The +second chain spreads the data over as many pages as +possible and jumps between pages at each reference. +The two chains are arranged so that the same amount of +data will fit into the cache, so that the raw memory +access time for each chain is identical, within +experimental constraints. The sole difference between +average access costs should be the TLB-lookup times. +.LP +When the pages from the second chain fit into the TLB, +the average access times for the two chains should be +identical. However, as soon as the number of pages in +the second chain exceeds the TLB size, the second +chain will start to pay TLB-miss costs. Depending on +the TLB replacement policy, the fraction of requests +generating TLB-misses in the second chain can vary +dramatically\**. +.FS +Pure LRU would ensure that as soon as the chain was one +page longer than the TLB size, every access would trigger +a TLB-miss. However, other replacement algorithms might +result in as few as $"number of pages" - "TLB size" + 1$ +misses per iteration over the loop. 
+.FE +.TSTART +.G1 +.so memhier-tlb.d +.G2 +.FEND "TLB" +.LP +The system must search for the point at which the +average memory latency of the second chain diverges +from the average latency of the first chain. Since +most systems have relatively small TLBs and since +checking TLB sizes smaller than the effective TLB +size is faster than checking TLB sizes larger than +the TLB, the system starts with the guess of eight +pages to establish a baseline. It then iteratively +doubles the number of pages until either a maximum +limit has been reached or the average TLB-miss cost +is greater than 15% of the average memory latency. +Once it discovers the upper bound on the possible +TLB size, it uses a binary search between the last +two TLB size guesses to find the point at which +the average latency for the two streams diverge. +.NH 1 +Cache size +.LP +For the purpose of identifying the cache size, the +ideal situation is that as long as the amount of +memory is equal to or less than the cache size, then +all the data is in the cache and the average memory +latency is the cache hit latency. As soon as the +memory doesn't fit in cache, then none of it should +be in the cache, so the average memory latency is +the cache miss latency.\** When examining average +memory latency versus memory size, this would give +nice flat plateaus for each cache, with nice sharp +transitions from one cache to the next, and from the +largest cache to main memory. +.FS +Of course, for real programs, you want the average +memory latency to be as low as possible, which means +that you want as much of the data in cache as possible. +.FE +.LP +However, the realities are that real data from real +systems is corrupted in a variety of ways. +First of all, even when the memory can fit into the +cache, pages often collide in the cache and the +fraction of pages that have collisions often +increases as the amount of memory nears the cache size. 
+Secondly, even when the memory cannot fit into the +cache, there can be pages that do not collide. +Finally, there is simple experimental noise, which is +usually limited to 1% or less. +.LP +The result of the first two problems is that on +some systems, the average memory latency increases +gradually as the memory size is increased. There +are no flat plateaus and sharp cliffs which make +it easy to identify the number, size, and +performance of the caches. +.NH 2 +Page coloring +.LP +The first problem is to create a set of pages +which do not collide in the cache. +The solution is to allocate more memory +than necessary, and to try different combinations +of pages to find the page set with the fastest +average memory latency. Unfortunately, the obvious +algorithm is exponential in the number of pages. +.TSTART +.G1 +.so memhier-color.d +.G2 +.FEND "Page Coloring Effects" +.LP +One observation is that cache misses are usually +much more expensive than cache hits. So, one +possibility is to choose a random set of pages +as the baseline and measure the average memory +latency. Then iterate over the pages, removing +that page from the set and measuring the average +memory latency of the reduced set. If that page +collides with another page, then the average +memory latency for the reduced set should be smaller +than the average latency for the whole set. +.LP +Once a page that collides has been identified, then +the system can iterate through available pages, +try adding them to the reduced set and measuring +the average memory latency. If the page doesn't +collide with any pages in the reduced set, then +the average memory latency should drop still further. +In this way, the system could identify all +colliding pages and replace them with pages +that don't collide (assuming the memory all +fits in the cache). +.LP +There are a number of problems with this simple approach. 
+First of all, it would take a very long time to run due +to the large, but polynomial, number of experiments required. +Secondly, as the memory size increases and the +number of pages involved gets large, the effect +of a single page on the average memory latency +can reach the level of experimental noise. +.LP +This approach makes the assumption that physical +page locations do not change once the memory +has been allocated. In most systems, this +assumption is valid unless the memory is paged +to disk. However, at least IRIX includes an +operating system configuration option to allow +the operating system to dynamically relocate +pages in memory. This capability is disabled +by default, so its use is relatively uncommon. +It is possible that page relocation will become +more common in the future, in which case this +design may need to be revisited in the future. +.LP +Our algorithm uses this basic approach, but +attempts to reduce the number of experiments +required by removing chunks of pages at a time. +It will remove up to 5% of pages at a time +and see if the average memory latency decreases +significantly, in which case it examines the +chunk a page at a time to find the page or +pages which probably conflict. +.LP +An additional problem is that for large caches, +the measured difference between two sets of +pages with just one page collision difference +can be very hard to measure. For example, +on a system with a 512Kbyte L2 cache and 4Kbyte +pages, the cache can hold 128 pages. Assuming +that a cache miss is 200ns, a cache hit is 50ns, +and 123 pages have no collisions but 5 pages +collide, then the average memory latency is +.EQ +t bar = { 123 times 50 + 5 times 200 } over 128 +.EN +or 55.85ns. Suppose we remove one page and +replace it with another page which doesn't +collide, so we now have 4 collisions and +124 pages without collisions, then the +average memory latency is 54.68ns. 
The +difference is generally significant even +in the face of experimental noise, but for +larger caches the differences may recede +into the background noise. +.LP +As caches increase in size, the problems +associated with detecting page collisions +can only increase. +For example, an 8MB cache on a system with +4KB pages would contain 2,048 pages. +Removing a single page collision, even when +the resulting memory latency for that page +reduces by a factor of four, would simply +result in an overall reduction in average +memory latency of less than 0.2%, which is +smaller than the average experimental measurement +errors. +.LP +Additionally, as caches increase in size, +effects such as cache consumption by the +page table can begin to become important. +.LP +The single largest remaining problem in our +system is that this algorithm does not +guarantee that we find a set of pages +which do not contain any collisions in all +cases that it \fImight\fR find such a set. +It merely does so \fImost\fR of the time +with (relatively) few measurements. +.LP +One possible means of dealing with this +problem is to try and remove sets of pages +in the hope that enough pages from a set +of colliding pages will be removed at +once, so that the remaining pages from +that collision set won't collide anymore. +Suppose you have a 4-way set associative +cache, and that you have six pages that +collide. If you remove two of the pages, +then the remaining four pages don't collide +anymore either. This means that by +removing two pages we have removed six +collisions, which should be easier to +detect. +.LP +XXX Look into randomizing the pages +after each iteration of the top-level +loop to make this sort of serendipitous +event more likely. +.NH 2 +Measurement +.LP +In order to reduce the number of memory sizes +that are measured by the system, we use a +binary search on memory sizes to find "edges" +in the memory latency. 
+We make the simplifying assumption that cache +sizes are either a power of two, or 1.5 times +a power of two. In our experience, this assumption +has been true. +We also assume that no cache is smaller than +512 bytes. +.LP +We explore the memory space at intervals +equivalent to the most recent power of two +divided by four. So, starting at one +megabyte we would (potentially) measure +memory latency at 1MB, 1.25MB, 1.5MB, and +1.75MB. This allows us to detect +cache sizes at the desired intervals, since +the measurement at the exact cache size +can often be corrupted by other system +activity so the next smaller measurement +should still be valid. +.LP +XXX If the measurement size increment is +several times larger than a page, then +perhaps we should actually measure the +system with a couple pages less than the +stated size? +This would allow us some "slop" for +collisions and might make it easier near +cache boundaries to get accurate +measurements. +The "slop" should probably be some fraction +of the measurement increment size, such as +10%, so it scales properly. +.LP +Since we start with a maximum size as a given, +and we use 512 bytes as a minimum, and we can +compute the full set of possible measurements, +and initialize an array with the desired sizes. +We can then use a modified binary search on +this array to efficiently locate cache edges +while still (potentially) leaving large, flat +plateaus unexplored between the end points. +.LP +Finally, we assume that true memory latency +is monotonically increasing with the amount +of memory that you access. +This means that if the measured latency ever +decreases as you increase the amount of +accessed memory, then the previous measurement +must have been an error and the value is +replaced by the smaller measurement. 
+.NH 2 +Data analysis +.LP +Assuming the data collected by the system +were noise-free and that the experimental +system had managed to eliminate all artifacts +such as page coloring effects, then the +next problem is to analyze the data to find +the number and size of the caches. +Basically this means examining the data to +find plateaus and cliffs. +Each plateau would represent a cache, and the +cliff represents the edge (size) of the cache. +.LP +Of course, real data is never perfect, and +there are any number of issues which can +affect the experimental results, so the +analysis methodology must be robust to noise. +.LP +XXX describe analysis methodology here +.NH 1 +Cache associativity +.LP +No modern caches are fully associative, meaning that +no caches use LRU replacement, because the performance +of such caches is insufficient. Most caches are +either set associative or direct mapped, meaning +that data from a given location can only go to +one of a small number of cache lines, and in the +case of a direct-mapped cache to a single cache line. +.LP +To determine the cache associativity we need to find +a set of pages which have no page collisions and +which (just) fit into the cache. We then need to +locate a page which collides with these pages and +append it to the set. +Then we can iterate through the pages in the initial +page set, removing a page at a time, and comparing +the resulting average memory latency with that of +the full set. +When the average memory latency drops significantly, +then we know that this page conflicts with the +full page set, and since the page set only has one +conflict, we know it conflicts with the newly +introduced page. +The number of pages that conflict with this newly +introduced page is the set associativity. +.LP +There is a potential bug in this algorithm +for systems with victim caches! 
+If the victim cache can hold at least a page +of data, then this algorithm cannot properly +determine the cache associativity because the +victim cache will play the role of additional +associative cache lines. +.LP +For smaller caches there is the additional +problem that the cache associativity may not +be smaller than the number of pages that the +cache may hold. +In which case, this simple approach will +never find pages that collide in the cache. +The solution to this problem is to increase +the line size and the number of pages so that +only portions of each page are accessed, and +there can be enough pages to create collisions. +.NH 1 +Memory parallelism +.LP +With the increasing memory bottleneck, most modern +systems allow multiple outstanding memory references. +On many systems, the effective parallelism depends +on which part of the memory hierarchy is being +accessed. For example, L1 caches can often service +as many as six or eight outstanding requests, while main +memory systems can usually support at most two +outstanding requests. +.LP +To measure the available parallelism for a given +chunk of memory, the system sets up a pointer +chain running through the memory exactly the same +as if it were to measure the average memory +latency. It then uses fifteen different access +routines, one for each possible level of parallelism.\** +.FS +The assumption here is that no memory subsystem +supports more than sixteen accesses in parallel. +.FE +Each routine dereferences $N$ pointers in parallel. +For example, the inner loop of the routine where +$N=2$ would look something like this: +.DS +while (iterations-- > 0) { + p0 = (char**)*p0; + p1 = (char**)*p1; +} +.DE +.LP +The available parallelism is the maximum speedup +over all N compared to the sequential case. +.LP +Note that this value is often not integral because +many factors go into the effective parallelism, +such as TLB contention, can limit the effective +parallelism. 
+.NH 1 +Conclusion +.LP +\*[lmbench] is a useful, portable micro-benchmark +suite designed to measure important aspects of +system performance. +\*[lmbench3] adds a number of important extensions, +such as the ability to measure system scalability. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers +were especially helpful. +We thank all of them +and especially thank: +Wayne Scott \s-1(BitMover)\s0, +Larry McVoy \s-1(BitMover)\s0, +and +Bruce Chapman \s-1(SUN)\s0. +.LP +We would also like to thank all of the people that have run the +benchmark and contributed their results; none of this would have been possible +without their assistance. +.LP +Our thanks to +all of the free software community for tools that were used during this +project. +\*[lmbench] is currently developed on Linux, a copylefted Unix written by +Linus Torvalds and his band of happy hackers. +This paper and all of the +\*[lmbench] documentation was produced using +the \f(CWgroff\fP suite of tools written by James Clark. +Finally, all of the data processing of the results is done with +\f(CWperl\fP written by Larry Wall. +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks are available at +.ft I +http://ftp.bitmover.com/lmbench +.ft +.\" .R1 +.\" bibliography references-lmbench3 +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. 
+.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.R1 +accumulate +sort A+DT +database references-userguide +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references-userguide +.R2 +.\" .so bios diff --git a/performance/lmbench3/hbench-REBUTTAL b/performance/lmbench3/hbench-REBUTTAL new file mode 100644 index 0000000..b5788a7 --- /dev/null +++ b/performance/lmbench3/hbench-REBUTTAL @@ -0,0 +1,245 @@ +In June of 1997, Margo Seltzer and Aaron Brown published a paper in +Sigmetrics called "Operating System Benchmarking in the Wake of Lmbench: +A Case Study of the Performance of NetBSD on the Intel x86 Architecture". + + +This papers claims to have found flaws in the original lmbench work. +With the exception of one bug, which we have of course fixed, we find +the claims inaccurate, misleading, and petty. We don't understand +what appears to be a pointless attack on something that has obviously +helped many researchers and industry people alike. lmbench was warmly +received and is widely used and referenced. We stand firmly behind the +work and results of the original benchmark. We continue to improve and +extend the benchmark. 
Our focus continues to be on providing a useful, +accurate, portable benchmark suite that is widely used. As always, we +welcome constructive feedback. + + +To ease the concerns of gentle benchmarkers around the world, we have +spent at least 4 weeks reverifying the results. We modified lmbench to +eliminate any effects of + + . clock resolution + . loop overhead + . timing interface overhead + +Our prediction was that this would not make any difference and our +prediction was correct. All of the results reported in lmbench 1.x are +valid except the file reread benchmark which may be 20% optimistic on +some platforms. + +We've spent a great deal of time and energy, for free, at the expense +of our full time jobs, to address the issues raised by hbench. We feel +that we were needlessly forced into a lose/lose situation of arguing +with a fellow researcher. We intend no disrespect towards their work, +but did not feel that it was appropriate for what we see as incorrect +and misleading claims to go unanswered. + +We wish to move on to the more interesting and fruitful work of extending +lmbench in substantial ways. + +Larry McVoy & Carl Staelin, June 1997 + +-------------------------------------------------------------------------- + +Detailed responses to their claims: + +Claim 1: + + "it did not have the statistical rigor and self-consistency + needed for detailed architectural studies" + +Reply: + + This is an unsubstantiated claim. There are no numbers which back + up this claim. + +Claim 2: + + "with a reasonable compiler, the test designed to read and touch + data from the file system buffer cache never actually touched + the data" + +Reply: + + Yes, this was a bug in lmbench 1.0. It has been fixed. + On platforms such as a 120 Mhz Pentium, we see a change of 20% + in the results, i.e., without the bug fix it is about 20% faster. + +Claim 3: + + This is a multi part claim: + + a) gettimeofday() is too coarse. 
+ +Reply: + + The implication is that there are number of benchmarks in + lmbench that finish in less time than the clock resolution + with correspondingly incorrect results. There is exactly one + benchmark, TCP connection latency, where this is true and that + is by design, not by mistake. All other tests run long enough + to overcome 10ms clocks (most modern clocks are microsecond + resolution). + + Seltzer/Brown point out that lmbench 1.x couldn't accurately + measure the L1/L2 cache bandwidths. lmbench 1.x didn't attempt + to report L1/L2 cache bandwidths so it would seem a little + unreasonable to imply inaccuracy in something the benchmark + didn't measure. It's not hard to get this right by the way, we + do so handily in lmbench 2.0. + + + b) TCP connection latency is reported as 0 on the DEC Alpha. + +Reply: + + We could have easily run the TCP latency connection benchmark in + a loop long enough to overcome the clock resolution. We were, + and are, well aware of the problem on DEC Alpha boxes. We run + only a few interations of this benchmark because the benchmark + causes a large number of sockets to get stuck in TIME_WAIT, + part of the TCP shutdown protocol. Almost all protocol stacks + degrade somewhat in performance when there are large numbers of + old sockets in their queues. We felt that showing the degraded + performance was not representative of what users would see. + So we run only for a small number (about 1000) interations and + report the result. We would not consider changing the benchmark + the correct answer - DEC needs to fix their clocks if they wish + to see accurate results for this test. + + We would welcome a portable solution to this problem. Reading + hardware specific cycle counters is not portable. + +Claim 4: + + "lmbench [..] was inconsistent in its statistical treatment of + the data" + ... 
+ "The most-used statistical policy in lmbench is to take the + minimum of a few repetitions of the measurement" + +Reply: + + Both of these claims are false, as can be seen by a quick inspection + of the code. The most commonly used timing method (16/19 tests + use this) is + + start_timing + do the test N times + stop_timing + report results in terms of duration / N + + In fact, the /only/ case where a minimum is used is in the + context switch test. + + The claim goes on to try and say that taking the minimum causes + incorrect results in the case of the context switch test. + Another unsupportable claim, one that shows a clear lack of + understanding of the context switch test. The real issue is cache + conflicts due to page placement in the cache. Page placement is + something not under our control, it is under the control of the + operating system. We did not, and do not, subscribe to the theory + that one should use better ``statistical methods'' to eliminate + the variance in the context switch benchmark. The variance is + what actually happened and happens to real applications. + + The authors also claim "if the virtually-contiguous pages of + the buffer are randomly assigned to physical addresses, as they + are in many systems, ... then there is a good probability that + pages of the buffer will conflict in the cache". + + We agree with the second part but heartily disagree with + the first. It's true that NetBSD doesn't solve this problem. + It doesn't follow that others don't. Any vendor supplied + operating system that didn't do this on a direct mapped L2 + cache would suffer dramatically compared to it's competition. + We know for a fact that Solaris, IRIX, and HPUX do this. + + A final claim is that they produced a modified version of the + context switch benchmark that does not have the variance of + the lmbench version. We could not support this. We ran that + benchmark on an SGI MP and saw the same variance as the original + benchmark. 
+ +Claim 5: + + "The lmbench bandwidth tests use inconsistent methods of accessing + memory, making it hard to directly compare the results of, say + memory read bandwidth with memory write bandwidth, or file reread + bandwidth with memory copy bandwidth" + ... + "On the Alpha processor, memory read bandwidth via array indexing + is 26% faster than via pointer indirection; the Pentium Pro is + 67% faster when reading with array indexing, and an unpipelined + i386 is about 10% slower when writing with pointer indirection" + +Reply: + In reading that, it would appear that they are suggesting that + their numbers are up to 67% different than the lmbench numbers. + We can only assume that this was delibrately misleading. + Our results are identical to theirs. How can this be? + + . We used array indexing for reads, so did they. + They /implied/ that we did it differently, when in fact + we use exactly the same technique. They get about + 87MB/sec on reads on a P6, so do we. We challenge + the authors to demonstrate the implied 67% difference + between their numbers and ours. In fact, we challenge + them to demonstrate a 1% difference. + + . We use pointers for writes exactly because we wanted + comparable numbers. The read case is a load and + an integer add per word. If we used array indexing + for the stores, it would be only a store per word. + On older systems, the stores can appear to go faster + because the load/add is slower than a single store. + + While the authors did their best to confuse the issue, the + results speak for themselves. We coded up the write benchmark + our way and their way. Results for a Intel P6: + + pointer array difference + L1 $ 587 710 18% + L2 $ 414 398 4% + memory 53 53 0% + + +Claim 5a: + The harmonic mean stuff. + +Reply: + They just don't understand modern architectures. The harmonic mean + theory is fine if and only if the process can't do two things at + once. 
Many modern processors can indeed do more than one thing at + once, the concept is known as super scalar, and can and does include + load/store units. If the processor supports both outstanding loads + and outstanding stores, the harmonic mean theory fails. + +Claim 6: + + "we modified the memory copy bandwidth to use the same size + data types as the memory read and write benchmark (which use the + machine's native word size); originally, on 32-bit machines, the + copy benchmark used 64-bit types whereas the memory read/write + bandwidth tests used 32- bit types" + +Reply: + + The change was to use 32 bit types for bcopy. On even relatively + modern systems, such as a 586, this change has no impact - the + benchmark is bound by memory sub systems. On older systems, the + use of multiple load/store instructions, as required for the smaller + types, resulted in lower results than the memory system could produce. + + The processor cycles required actually slow down the results. This + is still true today for in cache numbers. For example, an R10K + shows L1 cache bandwidths of 750MB/sec and 377MB/sec with 64 bit + vs 32 bit loads. It was our intention to show the larger number and + that requires the larger types. + + Perhaps because the authors have not ported their benchmark to + non-Intel platforms, they have not noticed this. The Intel + platform does not have native 64 bit types so it does two + load/stores for what C says is a 64 bit type. Just because it + makes no difference on Intel does not mean it makes no difference. diff --git a/performance/lmbench3/results/Makefile b/performance/lmbench3/results/Makefile new file mode 100644 index 0000000..024916a --- /dev/null +++ b/performance/lmbench3/results/Makefile @@ -0,0 +1,320 @@ +# Makefile for lmbench results. +# $Id: Makefile 1.11 00/01/31 16:29:28-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Usage: make [ LIST="aix/* sunos/* ..." ] [ what ] +# +# What to make: +# print Prints the results 1 per page. 
+# ps Saves the postscript of 1 per page in PS/PS +# 4.ps Saves the postscript of 4 per page in PS/PS4 +# 8.ps Saves the postscript of 8 per page in PS/PS8 +# x Previews 1 per page using groff -X +# summary [default] Ascii summary of the results +# stats Do statistics over a set of results +# roff Print the ascii summaries into a roff file +# slides Makes the pic for inclusion in slides +# +# This Makefile requires groff, gpic, and perl. You could try it with +# other *roff processors; I have no idea if it works. +# +# XXX - this is all out of date. +# +# There are three sorts of graphical results: +# +# 1. Bargraphs comparing each system in the LIST on the measurements listed +# in the BG list below (pretty much everything). +# 2. A 2-D graph for each system in LIST, displaying context switch times +# as a function of (# of processes, size of each process). +# 3. A 2-D graph for each system in LIST, displaying memory read times as +# a function of (stride size, memory size). +# +# The bargraphs are in a format of my own - the perl script in scripts +# called bargraph takes them as input and produces pic as output. +# It is a pretty straightforward format, you could probably incorparate +# into some Windows spreadsheet if you wanted to. See tmp/*.bg after +# running make in this directory. +# +# The 2-D graphs are in a format that can (probably) be read by Xgraph. +# I've added a few extensions for titles, etc., that you could just +# take out. See tmp/mem.* after running a make in this directory. +# +# This Makefile is of marginal usefulness to a site with just one machine. +# I intend to make results available so that people can compare, as well +# as a service where you can compare your results against the "best of +# the breed" for each vendor, as well as against best of the lot. + +# List of result files to process. Defaults to everything. 
+LIST= `$(SCRIPTS)getlist $(LST)` + +# Grrrrr +SHELL=/bin/sh + +SCRIPTS=../scripts/ +SRCS= ../scripts/allctx ../scripts/allmem ../scripts/bargraph \ + ../scripts/bghtml ../scripts/getbg ../scripts/getbw \ + ../scripts/getctx ../scripts/getdisk ../scripts/getlist \ + ../scripts/getmax ../scripts/getmem ../scripts/getpercent \ + ../scripts/getresults ../scripts/getsummary ../scripts/gifs \ + ../scripts/graph ../scripts/html-list ../scripts/html-man \ + ../scripts/os ../scripts/percent ../scripts/save \ + ../scripts/stats ../scripts/xroff + +MISC= tmp/misc_mhz.bg \ + tmp/lat_ctx.bg \ + tmp/lat_ctx8.bg \ + tmp/lat_nullsys.bg \ + tmp/lat_signal.bg \ + tmp/lat_pagefault.bg \ + tmp/lat_mappings.bg \ + tmp/lat_fs_create.bg + +PROC= tmp/lat_nullproc.bg \ + tmp/lat_simpleproc.bg \ + tmp/lat_shproc.bg + +LATENCY= \ + tmp/lat_pipe.bg \ + tmp/lat_connect.bg \ + tmp/lat_udp_local.bg \ + tmp/lat_rpc_udp_local.bg \ + tmp/lat_tcp_local.bg \ + tmp/lat_rpc_tcp_local.bg + +BANDWIDTH= \ + tmp/bw_pipe.bg \ + tmp/bw_tcp_local.bg \ + tmp/bw_file.bg \ + tmp/bw_reread.bg \ + tmp/bw_mmap.bg \ + tmp/bw_bcopy_libc.bg \ + tmp/bw_bcopy_unrolled.bg \ + tmp/bw_mem_rdsum.bg \ + tmp/bw_mem_wr.bg + +BG= $(MISC) $(PROC) $(LATENCY) $(BANDWIDTH) + +MK=@$(MAKE) -s +PRINT=groff -p | lpr -h +PS=groff -p | $(SCRIPTS)save PS/PS +PS8UP=groff -p | mpage -P- -8 -a | $(SCRIPTS)save PS/PS8 +PS4UP=groff -p | mpage -P- -4 -a | $(SCRIPTS)save PS/PS4 +SIZE=-big +IMAGE=pbm +CLOSE= +GMEM=$(CLOSE) -grid -logx -xm -below +GCTX=$(CLOSE) -grid -below +GDISK=-below -close -grid -nolines +#IMAGE=gifmono + +summary: $(SRCS) + @$(SCRIPTS)getsummary $(LIST) + +percent: $(SRCS) + @$(SCRIPTS)getpercent $(LIST) + +stats: $(SRCS) + $(SCRIPTS)getsummary $(LIST) | $(SCRIPTS)percent + +roff: + echo .nf > summary.roff + echo .ft CB >> summary.roff + echo .ps 12 >> summary.roff + echo .po .35i >> summary.roff + echo .sp .5i >> summary.roff + make LIST="$(LIST)" summary >> summary.roff + echo .bp >> summary.roff + echo .sp .5i >> 
summary.roff + make LIST="$(LIST)" percent >> summary.roff + +list: + @echo $(LIST) + +print: ctx mem disk bwfile bwmem + +8: + $(MK) LIST="$(LIST)" PRINT="groff -p | mpage -P -8 -a | lpr -h" print + +8.ps 8ps 8up: + $(MK) LIST="$(LIST)" PRINT="$(PS8UP)" print + +4.ps 4ps 4up: + $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" print + +ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" print + +smallps: + $(MK) LIST="$(LIST)" SIZE= PRINT="groff -p | $(SCRIPTS)save PS/smallPS" print + +x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" print + +ctx.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" ctx + +ctx.ps4: + $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" ctx + +mem.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" mem + +disk.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" disk + +bwfile.ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" bwfile + +bwfile.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwfile + +bwmem.ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" bwmem + +bwmem.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwmem + +smallx: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" SIZE= print + +slides: + $(MK) LIST="$(LIST)" SIZE=-slide bargraphs.slides ctx.slides mem.slides + +paper: + $(MK) LIST="$(LIST)" tbl.paper ctx.paper mem.paper + +# XXX - this has to be made incremental, doing everything over from +# scratch makes you want a Ghz machine. 
+html: + -make clean + #$(SCRIPTS)bghtml $(BG) + $(SCRIPTS)html-list $(LIST) + $(MK) LIST="$(LIST)" summary > HTML/summary + #make LIST="$(LIST)" percent > HTML/percent + $(MK) LIST="$(LIST)" SIZE= PRINT="$(PS)" \ + GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print + $(MK) LIST="$(LIST)" SIZE= NOOP=-noop PRINT="$(PS)" \ + GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print + gs -sOutputFile=HTML/ctx%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS < /dev/null + gs -sOutputFile=HTML/mem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.1 < /dev/null + gs -sOutputFile=HTML/disk%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.2 < /dev/null + gs -sOutputFile=HTML/bwfile%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.3 < /dev/null + gs -sOutputFile=HTML/bwmem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.4 < /dev/null + gs -sOutputFile=HTML/ctx-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.5 < /dev/null + gs -sOutputFile=HTML/mem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.6 < /dev/null + gs -sOutputFile=HTML/bwfile-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.7 < /dev/null + gs -sOutputFile=HTML/bwmem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.8 < /dev/null + $(SCRIPTS)/gifs + rm HTML/*.pbm HTML/___tmp* + +bghtml: + $(SCRIPTS)bghtml $(BG) + +html-list: + $(SCRIPTS)html-list $(LIST) + +ctx: dirs + $(SCRIPTS)getctx $(LIST) > tmp/FILES + @if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GCTX) $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No context switch data in $(LIST); \ + fi + +disk: dirs + if [ X$(NOOP) = X ]; then \ + $(SCRIPTS)getdisk $(LIST) > tmp/FILES; \ + if [ -s tmp/FILES ]; \ + then for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GDISK) $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No disk data in $(LIST); \ + fi; \ + fi + +mem: dirs 
+ $(SCRIPTS)getmem $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GMEM) -nomarks $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No memory latency data in $(LIST); \ + fi + +bwfile: dirs + $(SCRIPTS)getbw $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GMEM) -logy $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No file bandwidth data in $(LIST); \ + fi + +bwmem: dirs + $(SCRIPTS)getbw -all $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph -halfgrid -gthk_5 -thk2 -medium \ + -nomarks -nolabels -grapheach $(GMEM) \ + -logy %P="'`basename $$i`'" $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No memory bandwidth data in $(LIST); \ + fi + +tbl.paper: + $(SCRIPTS)getbg -paper $(LIST) + + +bargraphs.1st: dirs + $(SCRIPTS)getbg -nosort $(LIST) + #$(SCRIPTS)getmax -v $(PROC) + #$(SCRIPTS)getmax -v $(LATENCY) + #$(SCRIPTS)getmax -v -half $(BANDWIDTH) + +bargraphs: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT) + +bargraphs.slides: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i > $${i}.pic; \ + done + +bargraphs.8up: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +latency.8up: bargraphs.1st + for i in $(LATENCY); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +bw.8up: bargraphs.1st + for i in $(BANDWIDTH); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +get: # nothing 
to do + +clean: + /bin/rm -f PS/* GIF/* HTML/* tmp/* summary.roff + +dirs: + @if [ ! -d tmp ]; then mkdir tmp; fi + @if [ ! -d PS ]; then mkdir PS; fi + @if [ ! -d HTML ]; then mkdir HTML; fi diff --git a/performance/lmbench3/runtest.sh b/performance/lmbench3/runtest.sh new file mode 100755 index 0000000..3a81c6d --- /dev/null +++ b/performance/lmbench3/runtest.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +if [ -f bin/$(uname -m)-linux-gnu/$(scripts/config) ]; then + make rerun +else + make + make results +fi + +cd results +make summary + +exit 0 diff --git a/performance/lmbench3/scripts/Makefile b/performance/lmbench3/scripts/Makefile new file mode 100644 index 0000000..7abca50 --- /dev/null +++ b/performance/lmbench3/scripts/Makefile @@ -0,0 +1,8 @@ +# Makefile for lmbench scripts subdir. +#$Id: Makefile 1.3 00/01/31 16:29:28-08:00 lm@xxxxxxxxxxxxxxx $ + +get: + get -s + +clean: + diff --git a/performance/lmbench3/scripts/README b/performance/lmbench3/scripts/README new file mode 100644 index 0000000..6e84ad1 --- /dev/null +++ b/performance/lmbench3/scripts/README @@ -0,0 +1,7 @@ +$Id: README 1.2 97/06/14 21:10:42-07:00 lm@xxxxxxxxxxxxxxx $ + +This directory contains scripts used to generate or post process lmbench +output. You probably do not want to be here or run these by hand, the +Makefiles in ../src and ../results invoke these. There are some useful +scripts here, however, in particular the graphing scripts. If you are +interested in groff graphing tools, check out ../doc/*graph.1. diff --git a/performance/lmbench3/scripts/SHIT b/performance/lmbench3/scripts/SHIT new file mode 100644 index 0000000..de2a060 --- /dev/null +++ b/performance/lmbench3/scripts/SHIT @@ -0,0 +1,724 @@ + +# Go find perl if we are running this as a shell script. +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Mimic the BSD tool, sccs, for RCS. +# $Id: SHIT 1.2 95/11/29 12:39:38-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Note - this reflects a lot of my personal taste. 
I'll try and list the +# important differences here: +# +# A bunch of unused commands are not implemented. It is easy to add them, +# mail me if you want me to add something. Please include a spec of what +# you want the command to do. Mail lm@xxxxxxxxxxxx. +# +# I look at RCS file internals and know about certain fields as of revision +# 5.x. +# +# This interface does not require a list of files/directories for most +# commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +# such as clean -f, unedit, unget, do *not* have an implied list. In +# other words, +# rccs diffs is the same as rccs diffs RCS +# but +# rccs unedit is not the same as rccs unedit RCS +# +# If you add (potentially) destructive commands, please check for +# them in main() and make sure that the autoexpand does not happen. +# +# TODO: +# Make it so that you can pass a list of files/dirs via stdin. +# +# It might be nice to have all the "system" args printed out in +# verbose and/or learn mode. Depends on whether you want people +# to learn RCS or not. + +&init; +&main; + +sub init +{ + $0 =~ s|.*/||; + # Add commands here so that -w shuts up. + $lint = 0; + + &clean() && &create() && &example() && &get() && &edit() && + &unedit() && &unget() && &diffs() && &delta() && &help() && + &prs() && &prt() && &deledit() && &delget() && &enter() && + &info() && &ci() && &co() && &fix() && &print() + if $lint; +} + +sub help +{ + if ($#_ == -1) { + &usage; + } + + # Handle all the aliases. + if ($_[0] eq "unedit" || $_[0] eq "unget") { + &help("clean"); + } elsif ($_[0] eq "clean") { + } + warn "Extended help on @_ not available yet.\n"; +} + +sub usage +{ +print <<EOF; + +usage: $0 [$0 opts] command [args] [file and/or directory list] + +$0 options are: + -debug for debugging of $0 itself + -verbose for more information about what $0 is doing + +More information may be had by saying "$0 help subcommand". + +Most commands take "-s" to mean do the work silently. 
+ +Command Effect +------- ------ + clean - remove unedited (ro) working files + -e remove unmodified edited (rw) & unedited (ro) files + -f (force) remove modified working files as well + create - add a set of files to RCS control and get (co) the working files + -g do not do the get (co) of the working files + -y<msg> use <msg> as the description message (aka -d<msg>) + delta - check in a revision + -y<msg> use <msg> as the log message (aka -d<msg>) + -s + diffs - diff the working file against the RCS file + fix - redit the last revision + get - get the working file[s] (possibly for editing) + history - print history of the files + print - print the history and the latest contents + +Alias Real command Effect +----- ------------ ------ + ci - delta check in a revision + co - get check out a revision + enter - create -g initialize a file without a get afterward + unedit - clean -f remove working file even if modified + unget - clean -f remove working file even if modified + edit - get -e check out the file for editing + prs - history print change log history + prt - history print change log history + +An implied list of *,v and/or RCS/*,v is implied for most commands. +The exceptions are commands that are potentially destructive, such as +unedit. + +EOF + + exit 0; +} + +sub main +{ + local($cmd); + local(@args); + local(@comma_v); + + $cmd = "oops"; + $cmd = shift(@ARGV) if $#ARGV > -1; + &help(@ARGV) if $cmd eq "help" || $cmd eq "oops"; + + $dir_specified = $file_specified = 0; + foreach $_ (@ARGV) { + # If it is an option, just pass it through. + if (/^-/) { + push(@args, $_); + } + # If they specified an RCS directory, explode it into ,v files. + elsif (-d $_) { + $dir_specified = 1; + warn "Exploding $_\n" if $debug; + push(@args, grep(/,v$/, &filelist($_))); + push(@args, grep(/,v$/, &filelist("$_/RCS"))); + } + # If it is a file, make it be the ,v file. + else { + if (!/,v$/) { + # XXX - what if both ./xxx,v and ./RCS/xxx,v? 
+ if (-f "$_,v") { + $_ .= ",v"; + } else { + if (m|/|) { + m|(.*)/(.*)|; + $f = "$1/RCS/$2,v"; + } else { + $f = "RCS/$_,v"; + } + if (-f $f) { + $_ = $f; + } + } + } + if (-f $_) { + $file_specified = 1; + warn "Adding $_\n" if $debug; + push(@args, $_); + } else { + warn "$0: skipping $_, no RCS file.\n"; + } + } + } + + # Figure out if it is a potentially destructive command. These + # commands do not automagically expand *,v and RCS/*,v. + $destructive = ($cmd eq "clean" && $args[0] eq "-f") || + $cmd eq "unedit" || $cmd eq "unget"; + + # If they didn't specify a file or a directory, generate a list + # of all ./*,v and ./RCS/*,v files. + unless ($destructive || $dir_specified || $file_specified) { + warn "Exploding . && ./RCS\n" if $debug; + push(@args, grep(/,v$/, &filelist("."))); + push(@args, grep(/,v$/, &filelist("RCS"))); + } + + unless ($cmd =~ /^create$/) { + @comma_v = grep(/,v$/, @args); + if ($#comma_v == -1) { + ($s = "$cmd @ARGV") =~ s/\s+$//; + die "$0 $s: No RCS files specified.\n"; + } + } + + # Exit codes: + # 0 - it worked + # 1 - unspecified error + # 2 - command unknown + $exit = 2; + warn "Trying &$cmd(@args)\n" if $debug; + eval(&$cmd(@args)); + + if ($exit == 2) { + warn "Possible unknown/unimplemented command: $cmd\n"; + &usage; + } else { + exit $exit; + } +} + +# Read the directory and return a list of files. +# XXX - isn't there a builtin that does this? +sub filelist +{ + local(@entries) = (); + local($ent); + + opendir(DFD, $_[0]) || return (); + foreach $ent (readdir(DFD)) { + $ent = "$_[0]/$ent"; + next unless -f $ent; + push(@entries, $ent); + } + warn "filelist($_[0]): @entries\n" if $debug; + @entries; +} + +# Take a list of ,v files and return a list of associated working files. +sub working +{ + local(@working, $working) = (); + + foreach $comma_v (@_) { + # Strip the ,v. + # Strip the RCS specification. 
+ ($working = $comma_v) =~ s|,v$||; + $working =~ s|RCS/||; + push(@working, $working); + } + @working; +} + +# Same as "clean -f" - throw away all changes +sub unedit { &clean("-f", @_); } +sub unget { &clean("-f", @_); } + +# Get rid of everything that isn't edited and has an associated RCS file. +# -e remove edited files that have not been changed. +# -f remove files that are edited with changes (CAREFUL!) +# This implies the -e opt. +# -d<m> Check in files that have been modified. If no message, prompt +# on each file. This implies -e. +# -y<m> Like -d for people that are used to SCCS. +# -m<m> Like -d for people that are used to RCS. +# +# Note: this does not use rcsclean; I don't know when that showed up. And +# the 5.x release of RCS I have does not install it. +sub clean +{ + local(@working); + local($e_opt, $f_opt, $d_opt, $s_opt) = (0,0,0,0); + local($msg); + local(@checkins) = (); + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-s") { + $s_opt = 1; + shift(@_); + } elsif ($_[0] eq "-e") { + $e_opt = 1; + shift(@_); + } elsif ($_[0] eq "-f") { + $f_opt = $e_opt = 1; + shift(@_); + } elsif ($_[0] =~ /^-[dym]/) { + $d_opt = $e_opt = 1; + if ($_[0] =~ /^-[dym]$/) { + $msg = $_[0]; + } else { + ($msg = $_[0]) =~ s/-[ydm]//; + $msg = "-m'" . $msg . "'"; + } + shift(@_); + } else { + die "$0 clean: unknown option: $_[0]\n"; + } + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Unlink. + if (!-w $working[$i]) { + warn "rm $working[$i]\n" unless $s_opt; + # Make sure there is an RCS file + if (-f $_[$i]) { + # XXX - what if ro and edited? + unlink($working[$i]) unless $n; + } else { + warn "clean: no RCS file for $working[$i]\n"; + } + next; + } + + # If they just want to know about it, tell them. 
+ if ($e_opt == 0) { + open(RCS, $_[$i]); + while ($r = <RCS>) { + last if $r =~ /locks/; + } + @locks = (); + while ($r = <RCS>) { + # XXX - I use "comment" a delimiter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } else { + warn "$working[$i]: " . + "writeable but not edited?!?\n"; + } + next; + } + + # See if there have actually been any changes. + # Notice that this is cmp(1) in about 10 lines of perl! + open(RCS, "co -q -p -kkvl $_[$i] |"); + open(WORK, $working[$i]); + $diff = 0; + while ($r = <RCS>) { + unless (($w = <WORK>) && ($r eq $w)) { + $diff = 1; + last; + } + } + if ($w = <WORK>) { + $diff = 1; + } + close(RCS); close(WORK); + if ($diff) { + if ($f_opt) { + warn "Clean modified $working[$i]\n" + unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } elsif ($d_opt) { + push(@checkins, $_[$i]); + } else { + warn "Can't clean modified $working[$i]\n"; + } + next; + } else { + warn "rm $working[$i]\n" unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } + } + + # Handle files that needed deltas. + if ($#checkins > -1) { + warn "ci -q $msg @checkins\n" if $verbose; + system "ci -q $msg @checkins"; + } + + $exit = 0; +} + +# Create - initialize the RCS file +# -y<c> - use <c> as the description message for all files. +# -d<c> - use <c> as the description message for all files. +# -g - don't do the get +# +# Differs from sccs in that it does not preserve the original +# files (I never found that very useful). +sub create +{ + local($arg, $noget, $description, $cmd) = ("", "", ""); + + foreach $arg (@_) { + # Options... 
+ if ($arg =~ /^-[yd]/) { + ($description = $arg) =~ s/^-[yd]//; + $arg = ""; + warn "Desc: $description\n" if $debug; + next; + } + if ($arg eq "-g") { + $noget = "yes"; + $arg = ""; + next; + } + next if ($arg =~ /^-/); + + # If no RCS subdir, make one. + if ($arg =~ m|/|) { # full path + ($dir = $arg) =~ s|/[^/]+$||; + mkdir("$dir/RCS", 0775); + } else { # in $CWD + mkdir("RCS", 0775); + } + } + $exit = 0; + if ($description ne "") { + $cmd = "ci -t-'$description' @_"; + } else { + $cmd = "ci @_"; + } + warn "$cmd\n" if $verbose; + system "$cmd"; + system "co @_" unless $noget; +} + +# Like create without the get. +sub enter { &create("-g", @_); } + +# Edit - get the working file editable +sub edit { &get("-e", @_); } + +# co - normal RCS +sub co { &get(@_); } + +# Get - get the working file +# -e Retrieve a version for editing. +# Same as co -l. +# -p Print the file to stdout. +# -k Suppress expansion of ID keywords. +# Like co -kk. +# -s Suppress all output. +# +# Note that all other options are passed to co(1). +sub get +{ + local($arg, $working, $f, $p); + + $f = $p = 0; + foreach $arg (@_) { + # Options... + $arg = "-l" if ($arg eq "-e"); + $arg = "-kk" if ($arg eq "-k"); + $arg = "-q" if ($arg eq "-s"); + $f = 1 if ($arg eq "-f"); + $p = 1 if ($arg eq "-p"); # XXX - what if -sp? + + next if $arg =~ /^-/ || $p; + + # Check for writable files and skip them unless someone asked + # for co's -f option. + ($working = $arg) =~ s|,v$||; + $working =~ s|RCS/||; + if ((-w $working) && $f == 0) { + warn "ERROR [$arg]: writable `$working' exists.\n"; + $arg = ""; + } + } + @files = grep(/,v/, @_); + if ($#files == -1) { + warn "$0 $cmd: no files to get. @_\n"; + $exit = 1; + } else { + system "co @_"; + $exit = 0; + } +} + +# Aliases for history. 
+sub prt { &history(@_); } +sub prs { &history(@_); } + +# History - change history sub command +sub history +{ + local(@history); + + open(RL, "rlog @_|"); + # Read the whole history + while ($r = <RL>) { + # Read the history for one file. + if ($r !~ /^[=]+$/) { + push(@history, $r); + next; + } + &print_history(@history); + @history = (); + } + close(RL); + print "+-----------------------------------\n"; + $exit = 0; +} + +sub print_history +{ + for ($i = 0; $i <= $#_; ++$i) { + # Get the one time stuff + if ($_[$i] =~ /^RCS file:/) { + $_[$i] =~ s/RCS file:\s*//; + chop($_[$i]); + print "+------ $_[$i] -------\n|\n"; + } + + # Get the history + if ($_[$i] =~ /^----------------------------/) { + local($rev, $date, $author, $lines) = ("", "", "", ""); + + $i++; + die "Bad format\n" unless $_[$i] =~ /revision/; + $_[$i] =~ s/revision\s+//; + chop($_[$i]); + $rev = $_[$i]; + $i++; + die "Bad format\n" unless $_[$i] =~ /date/; + @parts = split(/[\s\n;]+/, $_[$i]); + for ($j = 0; $j <= $#parts; $j++) { + if ($parts[$j] =~ /date/) { + $j++; + $date = "$parts[$j] "; + $j++; + $date .= "$parts[$j]"; + } + if ($parts[$j] =~ /author/) { + $j++; + $author = $parts[$j]; + } + if ($parts[$j] =~ /lines/) { + $j++; + $lines = "$parts[$j] "; + $j++; + $lines .= "$parts[$j]"; + } + } + print "| $rev $date $author $lines\n"; + while ($_[++$i] && + $_[$i] !~ /^----------------------------/) { + print "| $_[$i]"; ### unless $rev =~ /^1\.1$/; + } + print "|\n"; + $i--; + } + } +} + +# Show changes between working file and RCS file +# +# -C -> -c for compat with sccs (not sure if this is needed...). 
+sub diffs +{ + local(@working); + local($diff) = "diff"; + local($rev) = ""; + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-C") { + $diff .= " -c"; + shift(@_); + } elsif ($_[0] =~ /^-r/) { + $rev = $_[0]; + shift(@_); + } elsif ($_[0] eq "-sdiff") { + # XXX - screen size + $diff = "sdiff -w80"; + shift(@_); + } else { + $diff .= " $_[0]"; + shift(@_); + } + + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Skip. + next unless (-w $working[$i]); + + # Show the changes + print "\n------ $working[$i]$rev ------\n"; + fflush(stdout); + # XXX - flush stdout. + if ($diff =~ /^sdiff/) { + system "co -q -p -kkvl $rev $_[$i] > /tmp/sdiff.$$" . + "&& $diff /tmp/sdiff.$$ $working[$i]"; + # XXX - interrupts? + unlink("/tmp/sdiff.$$"); + } else { + system "co -q -p -kkvl $rev $_[$i] |" . + " $diff - $working[$i]"; + } + } + + $exit = 0; +} + +# delta - check in the files +sub delta +{ + local($description) = (""); + local($i, @working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # Options... + if ($_[$i] =~ /^-[yd]/) { + ($description = $_[$i]) =~ s/^-[yd]/-m/; + $description = "'" . $description . "'"; + $_[$i] = ""; + next; + } + $_[$i] = "-q" if $_[$i] eq "-s"; + $_[$i] = "" unless -f $working[$i]; + } + $exit = 0; + warn "ci $description @_\n" if $verbose; + system "ci $description @_"; +} + +# Allow RCS interface ci +sub ci +{ + &delta(@_); +} + +# delget +sub delget +{ + &delta(@_); + &get(@_); # If there was a description, delta nuked it... +} + +# deledit +sub deledit +{ + &delta(@_); + &get("-e", @_); # If there was a description, delta nuked it... 
+} + + +# info - who is editing what +sub info +{ + local(@working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; $i++) { + open(RCS, $_[$i]); + while ($r = <RCS>) { + last if $r =~ /locks/; + } + @locks = (); + while ($r = <RCS>) { + # XXX - I use "comment" a delimter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } + } + $exit = 0; +} + +# Fix - fix the last change to a file +sub fix +{ + foreach $f (@_) { + next unless -f $f; + open(F, $f); while (<F>) { last if /head\s\d/; } close(F); + unless ($_ && /head/) { + warn "$0 $cmd: No head node found in $f\n"; + next; + } + s/head\s+//; chop; chop; $rev = $_; + ($working = $f) =~ s/,v//; + $working =~ s|RCS/||; + system "co -q $f && rcs -o$rev $f && rcs -l $f && chmod +w $working"; + } + $exit = 0; +} + +# print - print the history and the latest revision of the file +sub print +{ + local($file); + + foreach $file (@_) { + &history($file); + &get("-s", "-p", $file); + } + $exit = 0; +} + + +# Example - example sub command +# -Q change this option to -q just to show how. +sub example +{ + local($arg, $working); + + foreach $arg (@_) { + # Options... + $arg = "-Q" if ($arg eq "-q"); + } + warn "rlog @_\n" if $verbose; + system "rlog @_"; + $exit = 0; +} + +RCS bghtml html-list man2html diff --git a/performance/lmbench3/scripts/TODO b/performance/lmbench3/scripts/TODO new file mode 100755 index 0000000..c9430db --- /dev/null +++ b/performance/lmbench3/scripts/TODO @@ -0,0 +1,3 @@ +Make graph take a %T and %T2 and put %T above %T2 + +Or make it take \n in the title and deal. diff --git a/performance/lmbench3/scripts/allctx b/performance/lmbench3/scripts/allctx new file mode 100755 index 0000000..386c5e5 --- /dev/null +++ b/performance/lmbench3/scripts/allctx @@ -0,0 +1,71 @@ + +# Extract the context switching information from lmbench result files. +# Usage: getctx file file.... 
+# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: allctx 1.3 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$first = 1; +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|.*/||; + $file =~ s/\.\d+//; + while (<FD>) { + chop; + if (/^\[lmbench/) { + split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^.size=/) { + s/size/Process size/; + s/ ovr/\toverhead/; + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "\n" unless $first; + $first = 0; + print "%T $info[3] $info[$#info]Mhz\n"; + print "$_\n"; + while (<FD>) { + last if /^Null/ || /^Pipe/ || /^Memor/; + next if /\$Id/; + s/ ovr/\toverhead/; + s/size/Process size/; + print ; + } + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = sprintf("%.0f", $_[1]); + + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/allmem b/performance/lmbench3/scripts/allmem new file mode 100755 index 0000000..9243873 --- /dev/null +++ b/performance/lmbench3/scripts/allmem @@ -0,0 +1,69 @@ + +# Extract the memory latency graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: allmem 1.3 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +# Uses a stride of 128 +#print "\"%X Array size\n\"%Y Latency in nanoseconds\n"; +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|.*/||; + while (<FD>) { + chop; + if (/^\[lmbench/) { + split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^Memory load latency/) { + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "\"$file $info[3] $info[$#info]\n"; + while (<FD>) { + next unless /^"stride=128/; + last; + } + while (<FD>) { + if (/^\s*$/) { + print "\n"; + last; + } + print; + } + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = sprintf("%.0f", $_[1]); + + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/bargraph b/performance/lmbench3/scripts/bargraph new file mode 100755 index 0000000..f710133 --- /dev/null +++ b/performance/lmbench3/scripts/bargraph @@ -0,0 +1,430 @@ +# $Id: bargraph 1.5 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +# A simple bargraph preprocessor for GNU pic / troff package. +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# TODO +# Make this work with sideways graphs. 
+# +# Input format is: +# +# 3 foo bar +# 9 bigger foo +# "Silly example +# +# and output is +# +# bigger +# foo +# +----------+ +# | | +# foo | | +# bar | | +# +----------+ | | +# | | | | +# +----------+ +----------+ +# ------------------------------- +# 3 9 +# +# Silly example +# +# Input options: +# specifier value default +# %ps <point size> 10 +# %ft <font> HB +# %labelgap <space in inches between fill labels> 1.5 +# %xsize <size of graph width in inches> 7 +# %ysize <size of graph height in inches> 6 +# %Title n|s <Bargraph title> none +# %titleplus <increase in points of titlesize> 0 +# %label%d <label name> none +# %boxpercent <100% means columns touch> 75 +# %worse up|down n|w|e|s|nw|ne|sw|se - idiot arrow +# %better up|down n|w|e|s|nw|ne|sw|se - idiot arrow +# %fakemax <pretend one data point was this big> +# +# The data can be optionally followed by a %fill%d that gets turned into +# the fill value (darkness) for that bar of the bar graph. The default +# fill value is whatever pic defaults to. +# The %label control is used to provide a legend for the different fill +# values. +# +# Command line options: +# +# -big make the x/y defaults be 7.5 inches, crank up title size, and +# don't put a spacer at the top. +# -nobox do not put an outline box around the bargraph. +# +# -sideways +# do the bars towards the right. +# +# Much thanks to James Clark for providing such a nice replacement for +# the Unix troff package. + +@lines = <>; # sluuuuuuuuuuuurp +$titleplus = 2; +$bottomplus = 0; +$fill = "fillval"; +$SP = ".sp 1i"; +$PO = "0i"; +# All of these can be set in the graph with %xxx value +$ps = 10; +$ft = "CB"; +$xsize = 4; +$ysize = 6; +$boxpercent = 75; +$labelgap = 1.5; +if ($nobox) { + $invis = "invis"; +} else { + $invis = ""; +} +if ($big) { + $slide = 0; + $xsize = 7.5; + $ysize = 7.5; + $SP = ""; + $titleplus = 4; + $bottomplus = 2; + # XXX - you may need to screw with this. 
+ $xsize -= 3.75 if ($sideways); +} +if ($slide) { + $big = 0; + $xsize = 6.5; + $ysize = 4.20; + $SP = ".sp .75i"; + $PO = ".23i"; + $titleplus = 2; + $bottomplus = 0; + # XXX - you may need to screw with this. + $xsize -= 2.2 if ($sideways); +} + +$vs = $ps + 1; + +# Calculate max to autosize the graph. +foreach $_ (@lines) { + next if /^\s*#/; + next if /^\s*$/; + + if (/^\s*"/) { + ($title = $_) =~ s/\s*"//; + chop($title); + push(@title, "\"\\s+$titleplus$title\\s0\""); + next; + } + if (/^\s*%/) { + &control(0); + push(@control, $_); + next; + } + + @_ = split; + if (!defined $maxdata) { + $maxdata = $_[0]; + } else { + $maxdata = $_[0] if ($maxdata < $_[0]); + } + push(@data, $_); +} + +foreach $_ (@control) { + &control(1); +} + +$n = $#data + 1; +$tps = $ps + $titleplus; +$tvs = int($tps * 1.2); +print <<EOF; +$SP +.po $PO +.ft $ft +.ps $ps +.vs $tvs +.ce 100 +EOF +foreach $_ (@title_n) { + print; +} +# Spit out the pic stuff. +# The idea here is to spit the variables and let pic do most of the math. +# This allows tweeking of the output by hand. +print <<EOF; +.ce 0 +.vs +.PS +.ps $ps +.vs $vs +[ +# Variables, tweek these. + fillval = .12 # default fill value boxes + xsize = $xsize # width of the graph + ysize = $ysize # height of the graph + n = $n + boxpercent = $boxpercent / 100 + gap = xsize / n * (1 - boxpercent) + maxdata = $maxdata + yscale = ysize / maxdata + xscale = xsize / maxdata + +# Draw the graph borders + O: box invis ht ysize wid xsize +EOF +# line thick 2 from O.sw - (0, .1) to O.se - (0, .1) + +#foreach $_ (@control) { +# &control(1); +#} + +# boxwid = xsize / n * boxpercent +if ($sideways) { + print "boxht = ysize / n * boxpercent\n"; + # Each data point. + for ($i = 0; $i <= $#data; $i++) { + $_ = $data[$i]; + @_ = &getfill; + print "box fill $fill wid $_[0] * xscale " . 
+ "with .nw at O.nw - (0, gap /2 + $i * (ysize/n))\n"; + $value = shift(@_); + # XXXXXXX + if ($_[$#_] =~ /secs/) { + #print "\"@_\" ljust at last box.e + .1,0\n"; + $units = pop(@_); + $each = pop(@_); + print "\"\\s+1$value\\s0, @_,\\ \\s+1$each $units\\s0\" ljust at last box.e + .1,0\n"; + } else { + print "\"\\s+2$value\\s0 @_\" ljust at last box.e + .1,0\n"; + } + } +} else { + print "boxwid = xsize / n * boxpercent\n"; + # Each data point. + for ($i = 0; $i <= $#data; $i++) { + $_ = $data[$i]; + @_ = &getfill; + print "box fill $fill ht $_[0] * yscale " . + "with .sw at O.sw + (gap /2 + $i * (xsize/n), 0)\n"; + $value = shift(@_); + @_ = &fmt(@_); + #warn "V=$value\nT=@_\n"; + # Make the bar titles + for ($j = $#_; $j >= 0; $j--) { + print "\t\"$_[$j]\" at last box.n + (0, .05 + .12 * $j)\n"; + } + print "\t\"\\s+$bottomplus$value\\s0\" at last box.s - (0, .30)\n"; + } + +} + +# Labels, if any +if ($#labels > -1) { + print "\n# Labels.\n"; + print "[\n boxwid = .35; boxht = .18; y = .10; x = -.03; "; + print "labelgap = $labelgap\n"; + $first = 1; + foreach $_ (@labels) { + print " [ B: box fill $_[0]; "; + shift(@_); + print "\"@_\" ljust at B.e + (y, x) ]"; + if ($first == 1) { + $first = 0; + print "\n"; + } else { + print " \\\n\twith .w at last [].e + (labelgap, 0)\n"; + } + } + print "] with .nw at O.sw - (0, .6)\n"; +} + +$invis = "invis" if $sideways; + +print <<EOF; +] +box $invis wid last [].wid + .5 ht last [].ht + .5 with .nw at last [].nw + (-.25, .25) +move to last [].nw + 0,.25 +line thick 2 right 7 +move to last [].sw - 0,.25 +line thick 2 right 7 +.PE +.ft +.ps +.vs +.po +EOF + +print <<EOF; +.po .5i +.ft $ft +.ps $ps +.vs $tvs +.sp .5 +.ce 100 +EOF +foreach $_ (@title_s) { + print; +} +print <<EOF; +.po +.ft +.ps +.vs +.ce 0 +EOF +exit 0; + +sub fmt +{ + local(@args); + local(@ret); + + # XXX - this assumes that # is not used anywhere else in the + # label line. 
+ $_ = "@_"; + s/\\ /#/g; + @args = split; + foreach $_ (@args) { + s/#/ /g; + } + $len = 0; + foreach $_ (@args) { + $len = length($_) if (length($_) > $len); + } + $len += 2; + $word = shift(@args); + while ($#args > -1) { + if (length($word) + length($args[0]) < $len) { + $word .= " $args[0]"; + shift(@args); + } else { + push(@ret, $word); + $word = shift(@args); + } + } + push(@ret, $word); + reverse(@ret); +} + +# Eat some control information +# +sub control +{ + local($pass) = $_[0]; + + if ($pass == 0) { + s/.*%//; + chop; + } + @_ = split; + if ($_[0] =~ /[Ww]orse$/ || $_[0] =~ /[Bb]etter$/) { + return if ($pass == 0); + if ($#_ != 2) { + die "bad control: $_\n"; + return; + } + ($label, $dir, $where) = @_; + print "\n# Idiot arrow\n"; + print "[\tarrow thick 10 wid .5 ht .4 $dir 1.15\n"; + print "\t\"\\s+9$label\\s0\" "; + if ($dir eq "up") { + print "at last arrow.s - (0, .25)\n"; + } elsif ($dir eq "down") { + print "at last arrow.n + (0, .25)\n"; + } else { + die "bad control: $_\n"; + } + print "] with .$where at O.$where "; + if ($where eq "n") { + print "- (0, .5)\n"; + } elsif ($where eq "ne") { + print "- (.5, .5)\n"; + } elsif ($where eq "e") { + print "- (.5, 0)\n"; + } elsif ($where eq "se") { + print "- (.5, -.5)\n"; + } elsif ($where eq "s") { + print "+ (0, .5)\n"; + } elsif ($where eq "sw") { + print "+ (.5, .5)\n"; + } elsif ($where eq "w") { + print "+ (.5, 0)\n"; + } elsif ($where eq "nw") { + print "+ (.5, -.5)\n"; + } else { + die "bad control: $_\n"; + } + print "\n"; + } elsif ($_[0] =~ /Title/) { + # XXX - I haven't fixed this for -sideways + return if ($pass == 0); + if ($_[1] eq "n") { + shift(@_); shift(@_); + push(@title_n, "\\s+$titleplus@_\\s0\n"); + } elsif ($_[1] eq "s") { + shift(@_); shift(@_); + push(@title_s, "\\s+$titleplus@_\\s0\n"); + } else { + die "bad control: $_\n"; + } + } elsif ($_[0] =~ /ps/) { + $ps = $_[1]; + } elsif ($_[0] =~ /ft/) { + $ft = $_[1]; + } elsif ($_[0] =~ /xsize/) { + $xsize = $_[1]; + } elsif 
($_[0] =~ /ysize/) { + $ysize = $_[1]; + } elsif ($_[0] =~ /titleplus/) { + $titleplus = $_[1]; + } elsif ($_[0] =~ /boxpercent/) { + $boxpercent = $_[1]; + } elsif ($_[0] =~ /labelgap/) { + $labelgap = $_[1]; + } elsif ($_[0] =~ /label/) { # has to be after labelgap + return if ($pass == 0); + $_[0] =~ s/label//; + if (length($_[0]) > 0) { + $fill = $_[0]; + } else { + $fill = "fillval"; + } + push(@labels, "@_"); + } elsif ($_[0] =~ /fakemax/) { + if (!defined $maxdata) { + $maxdata = $_[1]; + } else { + $maxdata = $_[1] if ($maxdata < $_[1]); + } + } else { + die "bad control: $_\n"; + } +} + +# Look for a %fill[val], eat it, and set $fill +sub getfill +{ + local (@line); + + if (/%fill/) { + @_ = split; + foreach $_ (@_) { + if (/%fill/) { + s/%fill//; + if (length($_) > 0) { + $fill = $_; + } else { + $fill = "fillval"; + } + } else { + push(@line, $_); + } + } + } else { + $fill = "fillval"; + @line = split; + } + @line; +} diff --git a/performance/lmbench3/scripts/bghtml b/performance/lmbench3/scripts/bghtml new file mode 100755 index 0000000..5e01f0a --- /dev/null +++ b/performance/lmbench3/scripts/bghtml @@ -0,0 +1,39 @@ + +# Make HTML files that will point to the right GIF files. +# Usage: bghtml file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. 
+# $Id: bghtml 1.2 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$bar = 0; +for ($i = 0; $i <= $#ARGV; ++$i) { + $file = $ARGV[$i]; $file =~ s|tmp/||; $file =~ s|.bg$||; + if ($i > 0) { + $prev = $ARGV[$i - 1]; + $prev =~ s|tmp/||; + $prev =~ s|.bg$||; + $prev_html = "${prev}.html"; + } + if ($i < $#ARGV) { + $next = $ARGV[$i + 1]; + $next =~ s|tmp/||; + $next =~ s|.bg$||; + $next_html = "${next}.html"; + } + $name = "HTML/${file}.html"; + open(F, ">$name"); + print F "<a href=${file}.8>Man page for this benchmark</a><p>\n"; + $str = sprintf("<IMG SRC=\"bar%02d\">\n", ++$bar); + print F "$str<p>"; + print F "<a href=lmbench-toc.html><img src=\"gifs/arrows/b_arrow.gif\"</a>\n"; + print F "<a href=lmbench-S-6.html><img src=\"gifs/graph.gif\"</a>\n"; + print F "<a href=${prev_html}><img src=\"gifs/arrows/back.gif\"</a>\n" + if $i > 0; + print F "<a href=${next_html}><img src=\"gifs/arrows/forward.gif\"</a>\n" + if $i < $#ARGV; + close(F); +} +exit 0; diff --git a/performance/lmbench3/scripts/build b/performance/lmbench3/scripts/build new file mode 100755 index 0000000..16a6600 --- /dev/null +++ b/performance/lmbench3/scripts/build @@ -0,0 +1,252 @@ +#!/bin/sh + +CC=${CC-`../scripts/compiler`} +MAKE=${MAKE-`../scripts/make`} +OS=${OS-`../scripts/os`} +TARGET=${TARGET-`../scripts/target`} +BINDIR=../bin/"${OS}" +CONFIG=../bin/"${OS}"/`../scripts/config` +NULL=/dev/null + +BASE=/tmp/dummy +for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then BASE=${t}/dummy + break + fi +done + +trap 'rm -f ${BASE}$$.s ${BASE}$$.c ${BASE}$$.o ${BASE}$$; exit 1' 1 2 15 + +LDLIBS=-lm + +# check for HP-UX's ANSI compiler +echo "main(int ac, char *av[]) { int i; }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} +then + true; +else + rm -f ${BASE}$$ + if ${CC} ${CFLAGS} -Ae -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} + then + CFLAGS="${CFLAGS} -Ae" + fi +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# 
check for IA64 HP-UX w/ HP's ANSI compiler; may need pointer swizzling +arch=`echo $OS | awk -F- '{print $1;}'` +if [ X$CC = "Xcc" -a X$arch = "Xia64" ] +then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main(int ac, char *av[])" >> ${BASE}$$.c + echo "{ long* p = (long*)malloc(sizeof(long));" >> ${BASE}$$.c + echo "*p = 0; exit((int)*p); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} +DD64 -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && [ -x ${BASE}$$ ] \ + && ${BASE}$$ \ + && CFLAGS="${CFLAGS} +DD64" + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check for bcopy (optionally set the SYS5 flag) +echo "#include <string.h>" > ${BASE}$$.c +echo "main() { char a[256], b[256]; bcopy(a, b, 256); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -DSYS5" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for valloc +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "main() { char* buf = valloc(123); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -Dvalloc=malloc" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for getrusage +echo "#include <sys/types.h>" > ${BASE}$$.c +echo "#include <sys/time.h>" >> ${BASE}$$.c +echo "#include <sys/resource.h>" >> ${BASE}$$.c +echo "#ifndef RUSAGE_SELF" >> ${BASE}$$.c +echo "#define RUSAGE_SELF 0" >> ${BASE}$$.c +echo "#endif /* RUSAGE_SELF */" >> ${BASE}$$.c +echo "main() { struct rusage ru; getrusage(RUSAGE_SELF, &ru); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DRUSAGE" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lnsl +echo "extern int pmap_getport(); main() { pmap_getport(); }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lnsl 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lnsl" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + + +# check for -lsocket 
+echo "extern void* getservent(); main() { getservent(); }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lsocket 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lsocket" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lrt (solaris) +echo "extern int nanosleep(); main() { nanosleep(); }" >${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lrt 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lrt" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lrpc (cygwin/Windows) +echo "extern int pmap_set(); main() { pmap_set(); }" >${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lrpc 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lrpc" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for OSs that have S_IFFIFO instead of S_IFIFO +echo "#include <sys/stat.h>" > ${BASE}$$.c +echo "main() { return (S_IFIFO); }" >> ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + echo "#include <sys/stat.h>" > ${BASE}$$.c + echo "main() { return (S_IFFIFO); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -DS_IFIFO=S_IFFIFO" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "main() { uint i = 0; return (i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint64 +HAVE_uint64=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <rpc/types.h>" >> 
${BASE}$$.c +echo "main() { uint64 i = 0; return (int)(i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint64=1" && HAVE_uint64=1; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint64_t +if [ ${HAVE_uint64} = 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "#include <sys/types.h>" >> ${BASE}$$.c + echo "main() { uint64_t i = 0; return (int)(i); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint64_t=1"; + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have int64 +HAVE_int64=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <rpc/types.h>" >> ${BASE}$$.c +echo "main() { int64 i = 0; return (int)(i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_int64=1" && HAVE_int64=1; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have int64_t +if [ ${HAVE_int64} = 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "#include <sys/types.h>" >> ${BASE}$$.c + echo "main() { int64_t i = 0; return (int)(i); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_int64_t=1"; + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have drand48 and srand48 +HAVE_RANDOM=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "main() { srand48(973); return (int)(1.0E9 * drand48()); }" >> ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} -DHAVE_DRAND48" + HAVE_RANDOM=1 +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +if [ ${HAVE_RANDOM} -eq 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main() { srand(973); return (10 * rand()) / RAND_MAX; }" >> ${BASE}$$.c + if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} 
-DHAVE_RAND" + HAVE_RANDOM=1 + fi + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +if [ ${HAVE_RANDOM} -eq 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main() { srandom(973); return (10 * random()) / RAND_MAX; }" >> ${BASE}$$.c + if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} -DHAVE_RANDOM" + HAVE_RANDOM=1 + fi + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have sysmp +echo "#include <sys/types.h>" > ${BASE}$$.c +echo "#include <sys/sysmp.h>" >> ${BASE}$$.c +echo "main() { return (int)sysmp(MP_NPROCS); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_SYSMP=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have bindprocessor +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <unistd.h>" >> ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <sys/processor.h>" >> ${BASE}$$.c +echo "main() { return bindprocessor(BINDPROCESS, getpid(), 0); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_BINDPROCESSOR=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have processor_bind +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <sys/processor.h>" >> ${BASE}$$.c +echo "#include <sys/procset.h>" >> ${BASE}$$.c +echo "main() { return processor(P_PID, P_MYPID, 0, NULL); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_BINDPROCESSOR=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have sched_setaffinity +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <unistd.h>" >> ${BASE}$$.c +echo "#include <sched.h>" >> ${BASE}$$.c +echo "main() { unsigned long mask = 1; return sched_setaffinity(0, sizeof(unsigned long), &mask); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 
2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_SCHED_SETAFFINITY=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + + +if [ ! -d ${BINDIR} ]; then mkdir -p ${BINDIR}; fi + +# now go ahead and build everything! +${MAKE} OS="${OS}" CC="${CC}" CFLAGS="${CFLAGS}" LDLIBS="${LDLIBS}" O="${BINDIR}" $* diff --git a/performance/lmbench3/scripts/compiler b/performance/lmbench3/scripts/compiler new file mode 100755 index 0000000..2fca921 --- /dev/null +++ b/performance/lmbench3/scripts/compiler @@ -0,0 +1,16 @@ +#!/bin/sh + +if [ "X$CC" != "X" ] && echo "$CC" | grep -q '`' +then + CC= +fi + +if [ X$CC = X ] +then CC=cc + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/gcc ] + then CC=gcc + fi + done +fi +echo $CC diff --git a/performance/lmbench3/scripts/config b/performance/lmbench3/scripts/config new file mode 100755 index 0000000..b58cb60 --- /dev/null +++ b/performance/lmbench3/scripts/config @@ -0,0 +1,7 @@ +#!/bin/sh + +UNAME=`uname -n 2>/dev/null` +if [ X$UNAME = X ] +then echo CONFIG +else echo CONFIG.$UNAME +fi diff --git a/performance/lmbench3/scripts/config-run b/performance/lmbench3/scripts/config-run new file mode 100755 index 0000000..40217d4 --- /dev/null +++ b/performance/lmbench3/scripts/config-run @@ -0,0 +1,783 @@ +#!/bin/sh + +# Configure parameters for lmbench. +# %I% %E% %@% + +OS=`../scripts/os` +L='=====================================================================' +echo $L; +cat<<EOF; + + L M B E N C H C ON F I G U R A T I O N + ---------------------------------------- + +You need to configure some parameters to lmbench. Once you have configured +these parameters, you may do multiple runs by saying + + "make rerun" + +in the src subdirectory. + +NOTICE: please do not have any other activity on the system if you can +help it. Things like the second hand on your xclock or X perfmeters +are not so good when benchmarking. In fact, X is not so good when +benchmarking. + +EOF + +# Figure out echo. 
+if [ `echo -n "foo" | wc -l` -eq 0 ] +then ECHON="-n"; ECHOC= +else ECHON= ; ECHOC='\c' +fi + +############################################################################ +# Timing granulairty, loop overhead, etc. +############################################################################ +echo $L; echo ""; +echo "Hang on, we are calculating your timing granularity." +../bin/$OS/msleep 250 +ENOUGH=`../bin/$OS/enough` +export ENOUGH +echo "OK, it looks like you can time stuff down to $ENOUGH usec resolution." +echo "" +echo "Hang on, we are calculating your timing overhead." +../bin/$OS/msleep 250 +TIMING_O=`../bin/$OS/timing_o` +export TIMING_O +echo "OK, it looks like your gettimeofday() costs $TIMING_O usecs." +echo "" +echo "Hang on, we are calculating your loop overhead." +../bin/$OS/msleep 250 +LOOP_O=`../bin/$OS/loop_o` +export LOOP_O +echo "OK, it looks like your benchmark loop costs $LOOP_O usecs." +echo "" +############################################################################ +# Multiple copies +############################################################################ +echo $L +cat<<EOF; + +If you are running on an MP machine and you want to try running +multiple copies of lmbench in parallel, you can specify how many here. + +Using this option will make the benchmark run 100x slower (sorry). + +NOTE: WARNING! This feature is experimental and many results are + known to be incorrect or random! 
+ +EOF +AGAIN=Y +while [ $AGAIN = Y ] +do echo $ECHON "MULTIPLE COPIES [default 1] $ECHOC" +# read SYNC_MAX + if [ "X$SYNC_MAX" != X ] + then case "$SYNC_MAX" in + [0-9]|[0-9][0-9]|[0-9][0-9][0-9]) + AGAIN=N + ;; + *) echo "Please enter a number between 1 and 999" + ;; + esac + else AGAIN=N + SYNC_MAX=1 + fi +done + +LMBENCH_SCHED=DEFAULT +AGAIN=Y +while [ $AGAIN = Y ] +do cat<<EOF +Options to control job placement +1) Allow scheduler to place jobs +2) Assign each benchmark process with any attendent child processes + to its own processor +3) Assign each benchmark process with any attendent child processes + to its own processor, except that it will be as far as possible + from other processes +4) Assign each benchmark and attendent processes to their own + processors +5) Assign each benchmark and attendent processes to their own + processors, except that they will be as far as possible from + each other and other processes +6) Custom placement: you assign each benchmark process with attendent + child processes to processors +7) Custom placement: you assign each benchmark and attendent + processes to processors + +Note: some benchmarks, such as bw_pipe, create attendent child +processes for each benchmark process. For example, bw_pipe +needs a second process to send data down the pipe to be read +by the benchmark process. If you have three copies of the +benchmark process running, then you actually have six processes; +three attendent child processes sending data down the pipes and +three benchmark processes reading data and doing the measurements. 
+ +EOF + echo $ECHON "Job placement selection: $ECHOC" +# read LMBENCH_SCHED +LMBENCH_SCHED=1 + AGAIN=N + case "$LMBENCH_SCHED" in + 1) LMBENCH_SCHED=DEFAULT;; + 2) LMBENCH_SCHED=BALANCED;; + 3) LMBENCH_SCHED=BALANCED_SPREAD;; + 4) LMBENCH_SCHED=UNIQUE;; + 5) LMBENCH_SCHED=UNIQUE_SPREAD;; + 6) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM $LMBENCH_SCHED" + ;; + 7) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM_SPREAD $LMBENCH_SCHED" + ;; + *) AGAIN=Y + ;; + esac +done + +############################################################################ +# Figure out memory size. +############################################################################ +if [ -r /proc/cpuinfo ] +then + PROCESSORS=`grep processor /proc/cpuinfo | wc -l` +fi + +if [ -r /proc/meminfo ] +then + TMP=`grep 'MemTotal:' /proc/meminfo | awk '{print $2}'` + if [ X$TMP != X ] + then MB=`echo $TMP / 1024 | bc 2>/dev/null` + if [ X$MB = X ] + then MB=`expr $TMP / 1024 2>/dev/null` + fi + fi + TMP=`grep 'Mem:' /proc/meminfo | awk '{print $2}'` + if [ X$MB = X -a X$TMP != X ] + then MB=`echo $TMP / 1048576 | bc 2>/dev/null` + if [ X$MB = X ] + then MB=`expr $TMP / 1048576 2>/dev/null` + fi + fi +fi +if [ X$MB = X ] +then $ECHON "Probing system for available memory: $ECHOC" + MB=`../bin/$OS/memsize 4096` +fi +TOTAL_MEM=$MB +MB=`echo \( $MB \* 7 \) / 10 | bc 2>/dev/null` +if [ X$MB = X ] +then MB=`expr $TOTAL_MEM \* 7` + MB=`expr $MB / 10` +fi + +echo $L +cat<<EOF; + +Several benchmarks operate on a range of memory. This memory should be +sized such that it is at least 4 times as big as the external cache[s] +on your system. It should be no more than 80% of your physical memory. + +The bigger the range, the more accurate the results, but larger sizes +take somewhat longer to run the benchmark. 
+ +EOF +echo $ECHON "MB [default $MB] $ECHOC" +#read TMP +if [ X$TMP != X ] +then MB=$TMP +fi +# Certain machines tend to barf when you try and bcopy 8MB. +# Figure out how much we can use. +echo "Checking to see if you have $MB MB; please wait for a moment..." +MB=`../bin/$OS/memsize $MB` +MB=`../bin/$OS/memsize $MB` +MB=`../bin/$OS/memsize $MB` +if [ `expr $SYNC_MAX \* $MB` -gt `expr $TOTAL_MEM` ] +then + MB=`expr $TOTAL_MEM / $SYNC_MAX` + MB=`expr $MB / 2` +fi +if [ $MB -lt 8 ] +then echo $0 aborted: Not enough memory, only ${MB}MB available. + exit 1 +fi +if [ $MB -lt 16 ] +then echo Warning: you have only ${MB}MB available memory. + echo Some benchmark results will be less meaningful. +fi + +echo "Hang on, we are calculating your cache line size." +../bin/$OS/msleep 250 +LINE_SIZE=`../bin/$OS/line -M ${MB}M` +export LINE_SIZE +echo "OK, it looks like your cache line is $LINE_SIZE bytes." +echo "" + +############################################################################ +# Benchmarking subsets +############################################################################ +echo $L +cat<<EOF; + +lmbench measures a wide variety of system performance, and the full suite +of benchmarks can take a long time on some platforms. Consequently, we +offer the capability to run only predefined subsets of benchmarks, one +for operating system specific benchmarks and one for hardware specific +benchmarks. We also offer the option of running only selected benchmarks +which is useful during operating system development. + +Please remember that if you intend to publish the results you either need +to do a full run or one of the predefined OS or hardware subsets. 
+ +EOF + +echo $ECHON "SUBSET (ALL|HARWARE|OS|DEVELOPMENT) [default all] $ECHOC" +#read subset +subset=O +BENCHMARK_HARDWARE=NO +BENCHMARK_OS=NO +BENCHMARK_DEVELOPMENT=NO +case "$subset" in + [hH]*) BENCHMARK_HARDWARE=YES;; + [oO]*) BENCHMARK_OS=YES;; + [dD]*) BENCHMARK_DEVELOPMENT=YES;; + *) BENCHMARK_HARDWARE=YES; + BENCHMARK_OS=YES;; +esac + +if [ X$BENCHMARK_DEVELOPMENT = XYES ]; then + echo $L + + echo $ECHON "SYSCALL [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_SYSCALL=NO;; + *) BENCHMARK_SYSCALL=YES;; + esac + + echo $ECHON "SELECT [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_SELECT=NO;; + *) BENCHMARK_SELECT=YES;; + esac + + echo $ECHON "PROCESS CREATION [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PROC=NO;; + *) BENCHMARK_PROC=YES;; + esac + + echo $ECHON "PAGEFAULT [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PAGEFAULT=NO;; + *) BENCHMARK_PAGEFAULT=YES;; + esac + + echo $ECHON "FILE [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_FILE=NO;; + *) BENCHMARK_FILE=YES;; + esac + + echo $ECHON "MMAP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_MMAP=NO;; + *) BENCHMARK_MMAP=YES;; + esac + + echo $ECHON "CONTEXT SWITCH [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_CTX=NO;; + *) BENCHMARK_CTX=YES;; + esac + + echo $ECHON "PIPE [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PIPE=NO;; + *) BENCHMARK_PIPE=YES;; + esac + + echo $ECHON "UNIX socket [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_UNIX=NO;; + *) BENCHMARK_UNIX=YES;; + esac + + echo $ECHON "UDP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_UDP=NO;; + *) BENCHMARK_UDP=YES;; + esac + + echo $ECHON "TCP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_TCP=NO;; + *) BENCHMARK_TCP=YES;; + esac + + echo $ECHON "TCP CONNECT 
[default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_CONNECT=NO;; + *) BENCHMARK_CONNECT=YES;; + esac + + echo $ECHON "RPC [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_RPC=NO;; + *) BENCHMARK_RPC=YES;; + esac + + echo $ECHON "HTTP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_HTTP=NO;; + *) BENCHMARK_HTTP=YES;; + esac + + echo $ECHON "BCOPY [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_BCOPY=NO;; + *) BENCHMARK_BCOPY=YES;; + esac + + echo $ECHON "MEMORY HIERARCHY [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_MEM=NO;; + *) BENCHMARK_MEM=YES;; + esac + + echo $ECHON "CPU OPERATIONS [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_OPS=NO;; + *) BENCHMARK_OPS=YES;; + esac +fi + +############################################################################ +# Memory strides for lat_mem +############################################################################ +FASTMEM=NO +if [ "$BENCHMARK_HARDWARE" = "YES" ]; then + echo $L + cat<<EOF; + +This benchmark measures, by default, memory latency for a number of +different strides. That can take a long time and is most useful if you +are trying to figure out your cache line size or if your cache line size +is greater than 128 bytes. + +If you are planning on sending in these results, please don't do a fast +run. + +Answering yes means that we measure memory latency with a 128 byte stride. + +EOF + + echo $ECHON "FASTMEM [default no] $ECHOC" + read fast + case "$fast" in + [yY]*) FASTMEM=YES;; + *) FASTMEM=NO;; + esac +fi + +############################################################################ +# File system latency +############################################################################ +echo $L +cat<<EOF; + +This benchmark measures, by default, file system latency. That can +take a long time on systems with old style file systems (i.e., UFS, +FFS, etc.). 
Linux' ext2fs and Sun's tmpfs are fast enough that this +test is not painful. + +If you are planning on sending in these results, please don't do a fast +run. + +If you want to skip the file system latency tests, answer "yes" below. + +EOF + +echo $ECHON "SLOWFS [default no] $ECHOC" +#read slow +slow=n +case "$slow" in + [yY]*) SLOWFS=YES;; + *) SLOWFS=NO;; +esac + +############################################################################ +# Disk bandwidth/seek times +############################################################################ +if [ $SYNC_MAX -gt 1 -o "${BENCHMARK_HARDWARE}" != "YES" ]; then + # parallel benchmarking is incompatible with disk tests + DISK_DESC="" + DISKS="" +else + echo $L + cat<<EOF; + +This benchmark can measure disk zone bandwidths and seek times. These can +be turned into whizzy graphs that pretty much tell you everything you might +need to know about the performance of your disk. + +This takes a while and requires read access to a disk drive. +Write is not measured, see disk.c to see how if you want to do so. + +If you want to skip the disk tests, hit return below. + +If you want to include disk tests, then specify the path to the disk +device, such as /dev/sda. For each disk that is readable, you'll be +prompted for a one line description of the drive, i.e., + + Iomega IDE ZIP +or + HP C3725S 2GB on 10MB/sec NCR SCSI bus + +EOF + + echo $ECHON "DISKS [default none] $ECHOC" + read disks + if [ X"$disks" != X ] + then + for i in $disks + do if [ -r $i ] + then ../bin/$OS/flushdisk $i + if [ $? -eq 1 ] + then echo "Must be root to run disk benchmarks." + echo "Root is needed to flush the buffer cache" + exit 1 + fi + echo $ECHON "$i is a $ECHOC" + read x + DISK_DESC="$DISK_DESC[${i}:${x}] " + DISKS="$DISKS${i} " + else echo "Can't read $i, skipping it." 
+ fi + done + fi +fi + +############################################################################ +# Remote networking +############################################################################ +if [ $SYNC_MAX -gt 1 ]; then + # remote networking is incompatible with parallel benchmarking + REMOTE="" +else + echo $L + + RSH=rsh + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/remsh ] + then RSH=remsh + fi + done + RCP=rcp + + cat<<EOF; + +If you are running on an idle network and there are other, identically +configured systems, on the same wire (no gateway between you and them), +and you have rsh access to them, then you should run the network part +of the benchmarks to them. Please specify any such systems as a space +separated list such as: ether-host fddi-host hippi-host. + +EOF + echo $ECHON "REMOTE [default none] $ECHOC" +# read REMOTE + if [ "X$REMOTE" != X ] + then cat<<EOF; + +Thanks for doing remote testing, that is a hard thing to get. In +order to run a server on the remote system, we need a remote shell +to be enabled (ideally without a password) from this host to $REMOTE. +The original remote shell is rsh, but the use of a secure remote shell +like ssh is increasingly common. We need the name of BOTH the shell +itself and the associated copy tool (e.g. rcp vs scp) to be entered. + +EOF + echo $ECHON "RSH [default $RSH] $ECHOC" + read rsh + if [ -n "$rsh" ] + then RSH=$rsh + fi + echo $ECHON "RCP [default $RCP] $ECHOC" + read rcp + if [ -n "$rsh" ] + then RCP=$rcp + fi + + cat<<EOF; + +Could you do me one more favor and tell me the networking you think +will be used to get to each of the remote hosts. By networking I +mean one of the following (or whatever you use if you use something +else): + +ethernet aka 10baseT, thinnet, thicknet, etc +ethernet-100 aka 100baseT, 100VG +fddi aka cddi +hippi +others? + +Please type it just like the above if you can, it makes parsing easier. 
+ +EOF + + + for r in $REMOTE + do echo $ECHON "Network type for $r: $ECHOC" + read n + X=`$RSH $r echo foo` + if [ X$X = Xfoo ] + then echo Remote access to $r worked, thanks. + else echo Remote access to $r did not work, please check and retry, + exit 1 + fi + NETWORKS="${NETWORKS}[ $r:$n ]" + done + fi +fi + +############################################################################ +# Processor speed +############################################################################ +echo $L +echo "" +echo "Calculating mhz, please wait for a moment..." +MHZ=`../bin/$OS/mhz` +cat<<EOF +I think your CPU mhz is + + $MHZ + +but I am frequently wrong. If that is the wrong Mhz, type in your +best guess as to your processor speed. It doesn't have to be exact, +but if you know it is around 800, say 800. + +Please note that some processors, such as the P4, have a core which +is double-clocked, so on those processors the reported clock speed +will be roughly double the advertised clock rate. For example, a +1.8GHz P4 may be reported as a 3592MHz processor. + +EOF +echo $ECHON "Processor mhz [default $MHZ] $ECHOC" +#read mhz +if [ -n "$mhz" ] +then MHZ=$mhz +fi + + +############################################################################ +# /usr/tmp? +############################################################################ +echo $L +AGAIN=Y +while [ $AGAIN = Y ] +do + cat<<EOF; + +We need a place to store a $MB Mbyte file as well as create and delete a +large number of small files. We default to /usr/tmp. If /usr/tmp is a +memory resident file system (i.e., tmpfs), pick a different place. +Please specify a directory that has enough space and is a local file +system. 
+ +EOF + DEFAULTFSDIR=/usr/tmp + for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then DEFAULTFSDIR=$t + break + fi + done + echo $ECHON "FSDIR [default $DEFAULTFSDIR] $ECHOC" + #read FSDIR + if [ X$FSDIR = X ] + then FSDIR=$DEFAULTFSDIR + else mkdir -p $FSDIR 2>/dev/null + fi + if [ -d $FSDIR -a -w $FSDIR ] + then AGAIN=N + FILE=$FSDIR/XXX + else echo $FSDIR is not a directory or is not writable + fi +done + +############################################################################ +# status output? +############################################################################ +echo $L +cat<<EOF; + +lmbench outputs status information as it runs various benchmarks. +By default this output is sent to /dev/tty, but you may redirect +it to any file you wish (such as /dev/null...). + +EOF + +echo $ECHON "Status output file [default /dev/tty] $ECHOC" +#read OUTPUT +if [ "X$OUTPUT" = X ] +then OUTPUT=/dev/tty; +fi + +############################################################################ +# Submit results? +############################################################################ +echo $L +cat<<EOF; + +There is a database of benchmark results that is shipped with new +releases of lmbench. Your results can be included in the database +if you wish. The more results the better, especially if they include +remote networking. If your results are interesting, i.e., for a new +fast box, they may be made available on the lmbench web page, which is + + http://www.bitmover.com/lmbench + +EOF + +echo $ECHON "Mail results [default yes] $ECHOC" +#read MAIL +MAIL=n +case $MAIL in + [Nn]*) MAIL=no + echo OK, no results mailed. + ;; + *) MAIL=yes + ;; +esac + +INFO=`../scripts/info` +if [ $MAIL = yes ] +then if [ ! 
-f ../bin/$OS/$INFO ] + then cp ../scripts/info-template ../bin/$OS/$INFO + chmod +w ../bin/$OS/$INFO + REUSE=no + else + REUSE=view + while [ $REUSE = view ] + do echo "" + echo $ECHON \ +"Reuse previous description [default yes, other options: no|view] $ECHOC" + read REUSE + case $REUSE in + [Nn]*) REUSE=no + ;; + [Vv]*) REUSE=view + echo $L + more ../bin/$OS/$INFO + echo $L + ;; + *) REUSE=yes + ;; + esac + done + fi + + if [ $REUSE = no ] + then EDITOR=vi + echo $L + cat<<EOF; + +Please tell us about your machine. There is a form we would like you +to fill out that we will make available with the results. If you would +prefer to use a different editor, tell us the editor at the prompt. + +If you want to skip filling out this form (please don't) then answer +"none" at the prompt. + +EOF + echo $ECHON "Editor [default $EDITOR] $ECHOC" + read TMP + if [ X$TMP != X ] + then EDITOR=$TMP + fi + if [ X$EDITOR != "none" ] + then $EDITOR ../bin/$OS/`../scripts/info` + fi + fi +fi + +echo $L +echo "" +echo "Confguration done, thanks." +cat <<EOF + +There is a mailing list for discussing lmbench hosted at BitMover. +Send mail to majordomo@xxxxxxxxxxxx to join the list. 
+ +EOF + +VERSION=`../scripts/version` + +C=../bin/$OS/`../scripts/config` +echo DISKS=\"$DISKS\" > $C +echo DISK_DESC=\"$DISK_DESC\" >> $C +echo OUTPUT=$OUTPUT >> $C +echo ENOUGH=$ENOUGH >> $C +echo FASTMEM=\"$FASTMEM\" >> $C +echo FILE=$FILE >> $C +echo FSDIR=$FSDIR >> $C +echo INFO=$INFO >> $C +echo LINE_SIZE=$LINE_SIZE >> $C +echo LOOP_O=$LOOP_O >> $C +echo MAIL=$MAIL >> $C +echo TOTAL_MEM=$TOTAL_MEM >> $C +echo MB=$MB >> $C +echo MHZ=\"$MHZ\" >> $C +echo MOTHERBOARD=\"$MOTHERBOARD\" >> $C +echo NETWORKS=\"$NETWORKS\" >> $C +echo OS=\"$OS\" >> $C +echo PROCESSORS=\"$PROCESSORS\" >> $C +echo REMOTE=\"$REMOTE\" >> $C +echo SLOWFS=\"$SLOWFS\" >> $C +echo SYNC_MAX=\"$SYNC_MAX\" >> $C +echo LMBENCH_SCHED=\"$LMBENCH_SCHED\" >> $C +echo TIMING_O=$TIMING_O >> $C +echo RSH=$RSH >> $C +echo RCP=$RCP >> $C +echo VERSION=$VERSION >> $C +echo BENCHMARK_HARDWARE=$BENCHMARK_HARDWARE >> $C +echo BENCHMARK_OS=$BENCHMARK_OS >> $C +echo BENCHMARK_SYSCALL=$BENCHMARK_SYSCALL >> $C +echo BENCHMARK_SELECT=$BENCHMARK_SELECT >> $C +echo BENCHMARK_PROC=$BENCHMARK_PROC >> $C +echo BENCHMARK_CTX=$BENCHMARK_CTX >> $C +echo BENCHMARK_PAGEFAULT=$BENCHMARK_PAGEFAULT >> $C +echo BENCHMARK_FILE=$BENCHMARK_FILE >> $C +echo BENCHMARK_MMAP=$BENCHMARK_MMAP >> $C +echo BENCHMARK_PIPE=$BENCHMARK_PIPE >> $C +echo BENCHMARK_UNIX=$BENCHMARK_UNIX >> $C +echo BENCHMARK_UDP=$BENCHMARK_UDP >> $C +echo BENCHMARK_TCP=$BENCHMARK_TCP >> $C +echo BENCHMARK_CONNECT=$BENCHMARK_CONNECT >> $C +echo BENCHMARK_RPC=$BENCHMARK_RPC >> $C +echo BENCHMARK_HTTP=$BENCHMARK_HTTP >> $C +echo BENCHMARK_BCOPY=$BENCHMARK_BCOPY >> $C +echo BENCHMARK_MEM=$BENCHMARK_MEM >> $C +echo BENCHMARK_OPS=$BENCHMARK_OPS >> $C + +exit 0 diff --git a/performance/lmbench3/scripts/config-scaling b/performance/lmbench3/scripts/config-scaling new file mode 100755 index 0000000..12e0f02 --- /dev/null +++ b/performance/lmbench3/scripts/config-scaling @@ -0,0 +1,160 @@ +#!/bin/sh + +# config-scaling - reconfigure just the scaling parameter SYNC_MAX +# 
+# Hacked by Carl Staelin (staelin@xxxxxxxxxx). +# Copyright (c) 2002 Carl Staelin. GPLed software. +# $Id$ + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:../../scripts:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ ! -f $1 ]; then exit 1; fi + +. $1 +echo Using config in $1 + +OLD_SYNC_MAX=$SYNC_MAX + +############################################################################ +# Multiple copies +############################################################################ +echo $L +cat<<EOF; + +If you are running on an MP machine and you want to try running +multiple copies of lmbench in parallel, you can specify how many here. + +Using this option will make the benchmark run 100x slower (sorry). + +NOTE: WARNING! This feature is experimental and many results are + known to be incorrect or random! + +EOF +AGAIN=Y +while [ $AGAIN = Y ] +do echo $ECHON "MULTIPLE COPIES [default 1] $ECHOC" + read SYNC_MAX + if [ "X$SYNC_MAX" != X ] + then case "$SYNC_MAX" in + [0-9]|[0-9][0-9]|[0-9][0-9][0-9]) + AGAIN=N + ;; + *) echo "Please enter a number between 1 and 999" + ;; + esac + else AGAIN=N + SYNC_MAX=1 + fi +done + +if [ "X$LMBENCH_SCHED" = "X" ] +then + LMBENCH_SCHED=DEFAULT + AGAIN=Y + while [ "$AGAIN" = "Y" ] + do cat<<EOF +Options to control job placement +1) Allow scheduler to place jobs +2) Assign each benchmark process with any attendent child processes + to its own processor +3) Assign each benchmark process with any attendent child processes + to its own processor, except that it will be as far as possible + from other processes +4) Assign each benchmark and attendent processes to their own + processors +5) Assign each benchmark and attendent processes to their own + processors, except that they will be as far as possible from + each other and other processes +6) Custom placement: you assign each benchmark process with attendent + child processes to processors +7) Custom placement: you assign each benchmark and attendent + processes to 
processors + +Note: some benchmarks, such as bw_pipe, create attendent child +processes for each benchmark process. For example, bw_pipe +needs a second process to send data down the pipe to be read +by the benchmark process. If you have three copies of the +benchmark process running, then you actually have six processes; +three attendent child processes sending data down the pipes and +three benchmark processes reading data and doing the measurements. + +EOF + echo $ECHON "Job placement selection: $ECHOC" + read LMBENCH_SCHED + AGAIN=N + case "$LMBENCH_SCHED" in + 1) LMBENCH_SCHED=DEFAULT;; + 2) LMBENCH_SCHED=BALANCED;; + 3) LMBENCH_SCHED=BALANCED_SPREAD;; + 4) LMBENCH_SCHED=UNIQUE;; + 5) LMBENCH_SCHED=UNIQUE_SPREAD;; + 6) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM $LMBENCH_SCHED" + ;; + 7) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM_SPREAD $LMBENCH_SCHED" + ;; + *) AGAIN=Y + ;; + esac + done +fi + +if [ `expr $SYNC_MAX \* $MB` -gt `expr $TOTAL_MEM / 2` ] +then + MB=`expr $TOTAL_MEM / $SYNC_MAX` + MB=`expr $MB / 2` +fi + +C=$1 +echo DISKS=\"$DISKS\" > $C +echo DISK_DESC=\"$DISK_DESC\" >> $C +echo OUTPUT=$OUTPUT >> $C +echo ENOUGH=$ENOUGH >> $C +echo FASTMEM=\"$FASTMEM\" >> $C +echo FILE=$FILE >> $C +echo FSDIR=$FSDIR >> $C +echo INFO=$INFO >> $C +echo LINE_SIZE=$LINE_SIZE >> $C +echo LOOP_O=$LOOP_O >> $C +echo MAIL=$MAIL >> $C +echo TOTAL_MEM=$TOTAL_MEM >> $C +echo MB=$MB >> $C +echo MHZ=\"$MHZ\" >> $C +echo MOTHERBOARD=\"$MOTHERBOARD\" >> $C +echo NETWORKS=\"$NETWORKS\" >> $C +echo OS=\"$OS\" >> $C +echo PROCESSORS=\"$PROCESSORS\" >> $C +echo REMOTE=\"$REMOTE\" >> $C +echo SLOWFS=\"$SLOWFS\" >> $C +echo SYNC_MAX=\"$SYNC_MAX\" >> $C +echo LMBENCH_SCHED=\"$LMBENCH_SCHED\" >> $C +echo TIMING_O=$TIMING_O >> $C +echo RSH=$RSH >> $C +echo RCP=$RCP >> $C +echo VERSION=$VERSION >> $C +echo BENCHMARK_HARDWARE=$BENCHMARK_HARDWARE >> 
$C +echo BENCHMARK_OS=$BENCHMARK_OS >> $C +echo BENCHMARK_SYSCALL=$BENCHMARK_SYSCALL >> $C +echo BENCHMARK_SELECT=$BENCHMARK_SELECT >> $C +echo BENCHMARK_PROC=$BENCHMARK_PROC >> $C +echo BENCHMARK_CTX=$BENCHMARK_CTX >> $C +echo BENCHMARK_PAGEFAULT=$BENCHMARK_PAGEFAULT >> $C +echo BENCHMARK_FILE=$BENCHMARK_FILE >> $C +echo BENCHMARK_MMAP=$BENCHMARK_MMAP >> $C +echo BENCHMARK_PIPE=$BENCHMARK_PIPE >> $C +echo BENCHMARK_UNIX=$BENCHMARK_UNIX >> $C +echo BENCHMARK_UDP=$BENCHMARK_UDP >> $C +echo BENCHMARK_TCP=$BENCHMARK_TCP >> $C +echo BENCHMARK_CONNECT=$BENCHMARK_CONNECT >> $C +echo BENCHMARK_RPC=$BENCHMARK_RPC >> $C +echo BENCHMARK_HTTP=$BENCHMARK_HTTP >> $C +echo BENCHMARK_BCOPY=$BENCHMARK_BCOPY >> $C +echo BENCHMARK_MEM=$BENCHMARK_MEM >> $C +echo BENCHMARK_OPS=$BENCHMARK_OPS >> $C + +exit 0 diff --git a/performance/lmbench3/scripts/depend b/performance/lmbench3/scripts/depend new file mode 100755 index 0000000..30452ec --- /dev/null +++ b/performance/lmbench3/scripts/depend @@ -0,0 +1,28 @@ + +# Figure out dependencies for lmbench src. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: depend 1.4 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +open(M, "Makefile"); +while(<M>) { + push(@Makefile, $_); + last if /^..MAKEDEPEND/; +} +close(M); +open(G, "gcc -MM *.c | grep -v mhz.c | grep -v lat_ctx.c|"); +while (<G>) { + chop; + split(/:/); + $_[0] =~ s/\.o\s*$//; + push(@Makefile, "\$O/$_[0]: $_[1] \$O/lmbench.a\n"); + push(@Makefile, "\t\$(COMPILE) -o \$O/$_[0] $_[0].c \$O/lmbench.a \$(LDLIBS)\n\n"); +} +system "mv Makefile Makefile.old"; +open(M, ">Makefile"); +print M @Makefile; +close(M); +exit 0; diff --git a/performance/lmbench3/scripts/do_ctx b/performance/lmbench3/scripts/do_ctx new file mode 100755 index 0000000..002a6c2 --- /dev/null +++ b/performance/lmbench3/scripts/do_ctx @@ -0,0 +1,35 @@ +#!/bin/sh + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ X$MB = X ] +then MB=8 +fi +AVAILKB=`expr $MB \* 1024` + +# Figure out how big we can go for stuff that wants to use +# all and half of memory. +HALF="512 1k 2k 4k 8k 16k 32k 64k 128k 256k 512k 1m" +ALL="$HALF 2m" +i=4 +while [ $i -le $MB ] +do + ALL="$ALL ${i}m" + h=`expr $i / 2` + HALF="$HALF ${h}m" + i=`expr $i \* 2` +done + +msleep 250 +if [ "X$CTX" = X ] +then CTX="0 4 8 16 32 64" +fi +if [ "X$N" = X ] +then N="2 4 8 16 24 32 64 96" +fi +for size in $CTX +do lat_ctx -s $size $N +done +exit 0 diff --git a/performance/lmbench3/scripts/getbg b/performance/lmbench3/scripts/getbg new file mode 100755 index 0000000..24e49ad --- /dev/null +++ b/performance/lmbench3/scripts/getbg @@ -0,0 +1,806 @@ + +# Extract bargraph data from lmbench results. +# Usage: getbg file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: getbg 1.18 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Sws $0 "$@"' + if 0; + +@bw_file = @file = @lat_ctx32_8 = @lat_ctx32 = @lat_ctx8 = @lat_ctx = +@lat_shproc = @lat_simpleproc = @lat_nullproc = +@lat_rpc_tcp_local = @lat_rpc_udp_local = @lat_tcp_local = @lat_udp_local = +@lat_pipe = @lat_disk = @mhz = @lat_fs_delete = @lat_fs_create = +@lat_mappings = @lat_pagefault = @lat_connect = @lat_signal = @lat_sigaction = +@lat_nullsys = @lat_mem = @lat_l2 = @lat_l1 = (); +$nosort = $v = $paper = $slide = 0 if 0; +$sortN = 0; +$n = 0; +foreach $file (@ARGV) { + warn "$0: doing $file\n" if $v; + open(FD, $file) || die "$0: can't open $file"; + $file =~ s|/|-|; + $file =~ s/\.\d+//; + push(@file, $file); + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + push(@uname, "@_"); + } + if (/Mhz/) { + @_ = split; + push(@misc_mhz, $_[0]); + } + if (/^Null syscall:/) { + @_ = split; + push(@lat_nullsys, $_[2]); + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_sigaction, $_[3]); + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_signal, $_[3]); + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + } + if (/UDP latency using localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + } + if (/TCP latency using localhost/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + } + if (/RPC.udp latency using localhost/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + } + if (/RPC.tcp latency using localhost/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + } + if (/TCP\/IP connection cost to localhost/) { + @_ = split; + push(@lat_connect, $_[5]); + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + } + if 
(/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + } + if (/size=0 ovr=/) { + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx, -1); + } + next unless /^2/; + @_ = split; + push(@lat_ctx, $_[1]); + last; + } + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx, -1); + } + next unless /^8/; + @_ = split; + push(@lat_ctx8, $_[1]); + last; + } + } + if (/size=32 ovr=/) { + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx32, -1); + } + next unless /^2/; + @_ = split; + push(@lat_ctx32, $_[1]); + last; + } + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx32_8, -1); + } + next unless /^8/; + @_ = split; + push(@lat_ctx32_8, $_[1]); + last; + } + } + if (/^Pipe bandwidth/) { + @_ = split; + push(@bw_pipe, $_[2]); + } + if (/^Socket bandwidth using localhost/) { + @_ = split; + push(@bw_tcp_local, $_[4]); + } + if (/^Disk .* latency:/) { + @_ = split; + push(@lat_disk, $_[3]); + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, 
$value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^0k\s/) { + @_ = split; + push(@lat_fs_create, int(1000000/$_[2])); + push(@lat_fs_delete, int(1000000/$_[3])); + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + @_ = split; + push(@lat_l1, $_[1]); + } elsif (/^0.12500\s/) { + @_ = split; + push(@lat_l2, $_[1]); + } elsif (/^[45678].00000\s/) { + @_ = split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + } + foreach $array ( + 'misc_mhz', 'lat_nullsys', 'lat_pipe', 'lat_udp_local', + 'lat_tcp_local', 'lat_rpc_udp_local', 'lat_connect', + 'lat_rpc_tcp_local', 'lat_nullproc', 'lat_simpleproc', + 'lat_ctx', 'lat_ctx8', 'bw_pipe', 'bw_tcp_local', + 'bw_file', 'lat_mappings', 'bw_reread', 'bw_mmap', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_mem_rdsum', + 'bw_mem_wr', 'lat_l1', 'lat_l2', 'lat_mem', 'lat_disk', + ) { + $last = eval '$#' . $array; + if ($last != $n) { + warn "No data for $array in $file\n"; + eval 'push(@' . $array . 
', -1);'; + } + } + $n++; +} + +if ($paper) { + &tbl("lat_nullsys", "usecs", "system call"); + &tbl2("lat_signal", "lat_sigaction", "lat_signal", "usecs", + "signal", "sigaction", "sig handler"); + #&tbl("lat_nullproc", "msecs", "Process fork/exit time in milliseconds"); + #&tbl("lat_simpleproc", "msecs", "Simple process create time in milliseconds"); + #&tbl("lat_shproc", "msecs", "Process creates via /bin/sh time in milliseconds"); + #&tbl2("lat_proc", "lat_simpleproc", "lat_shproc", "usecs", + # "Process create time in milliseconds", "exec(2)", "/bin/sh -c"); + &procs("lat_allproc", "lat_nullproc", "lat_simpleproc", "lat_shproc", + "msecs"); + &ctx; + &tbl("lat_pipe", "usecs", "Pipe latency"); + &tbl("lat_connect", "usecs", "TCP connection"); + &tbl2("lat_udp", "lat_udp_local", "lat_rpc_udp_local", "usecs", + "UDP latency in \\(*mseconds", "UDP", "RPC/UDP"); + &tbl2("lat_tcp", "lat_tcp_local", "lat_rpc_tcp_local", "usecs", + "TCP latency in \\(*mseconds", "TCP", "RPC/TCP"); + &tbl("lat_mappings", "usecs", "Memory mapping latency in \\(*mseconds"); + &tbl("lat_pagefault", "usecs", "Pagefault latency in \\(*mseconds"); + &tbl2("lat_fs", "lat_fs_create", "lat_fs_delete", "usecs", + "File latency in milliseconds", "Create", "Delete"); + &tbl("lat_disk", "usecs", "Disk latency"); + + &tbl("misc_mhz", "mhz", "Processor clock rate"); + &tbl("bw_pipe", "MB", "Pipe bandwidth in MB / second"); + &tbl("bw_tcp_local", "MB", "Local TCP socket bandwidth in MB / second"); + &ipc; + &tbl("bw_file", "MB", "File write bandwidth in MB / second"); + &tbl("bw_reread", "MB", "(Re)Read in MB / second"); + &tbl("bw_mmap", "MB", "(Re)Read via mmap bandwidth in MB / second"); + &read; + &tbl2("bw_bcopy", "bw_bcopy_unrolled", "bw_bcopy_libc", "MB", + "Bcopy bandwidth in MB / second", "Unrolled", "Libc"); + &tbl("bw_mem_rdsum", "MB", "Memory read & sum bandwidth in MB / second"); + &tbl("bw_mem_wr", "MB", "Memory write bandwidth in MB / second"); + &mem; + +} else { + &bg("lat_nullsys", 
"usecs", "Number of null system calls per second"); + &bg("lat_signal", "usecs", "Number of signal handlers per second"); + &bg("lat_nullproc", "usecs", "Number of process forks/exits per second"); + &bg("lat_simpleproc", "usecs", "Number of simple process creates per second"); + &bg("lat_shproc", "usecs", "Number of simple process creates via /bin/sh per second"); + &bg("lat_ctx", "usecs", "Number of context switches per second, 2 small processes"); + &bg("lat_ctx8", "usecs", "Number of context switches per second, 8 small processes"); + + &bg("lat_pipe", "usecs", "Number of pipe transactions per second"); + &bg("lat_connect", "usecs", "Number of local TCP socket connections per second"); + &bg("lat_tcp_local", "usecs", "Number of local TCP socket transactions per second"); + &bg("lat_udp_local", "usecs", "Number of local UDP socket transactions per second"); + &bg("lat_rpc_udp_local", "usecs", + "Number of local RPC/UDP socket transactions per second"); + &bg("lat_rpc_tcp_local", "usecs", + "Number of local RPC/TCP socket transactions per second"); + &bg("lat_mappings", "usecs", "Number of memory mappings per second"); + &bg("lat_pagefault", "usecs", "Number of pagefaults per second"); + &bg("lat_fs_create", "usecs", "Number of file creates per second"); + + &bg("misc_mhz", "mhz", "Processor clock rate"); + &bg("bw_pipe", "MB", "Pipe bandwidth in MB / second"); + &bg("bw_tcp_local", "MB", "Local TCP socket bandwidth in MB / second"); + &bg("bw_file", "MB", "File write bandwidth in MB / second"); + &bg("bw_reread", "MB", "(Re)Read in MB / second"); + &bg("bw_mmap", "MB", "(Re)Read via mmap bandwidth in MB / second"); + &bg("bw_bcopy_libc", "MB", "Libc bcopy bandwidth in MB / second"); + &bg("bw_bcopy_unrolled", "MB", "Unrolled bcopy bandwidth in MB / second"); + &bg("bw_mem_rdsum", "MB", "Memory read & sum bandwidth in MB / second"); + &bg("bw_mem_wr", "MB", "Memory write bandwidth in MB / second"); +} + +exit 0; + +# Input looks like +# "benchmark name +# size 
value +# .... +# <blank line> +# +# Return the biggest vvalue before the blank line. +sub getbiggest +{ + local($msg) = @_; + + undef $save; + $value = 0; + while (<FD>) { + last if /^\s*$/; + $save = $_ if /^\d\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +sub bigger +{ + local($v1, $v2) = ($a, $b); + + if ($sortN > 0) { + $v1 = (split(/\t/, $v1))[$sortN]; + $v2 = (split(/\t/, $v2))[$sortN]; + } else { + $v1 =~ s/.*\t//; + chop($v1); + $v2 =~ s/.*\t//; + chop($v2); + } + return ($v1 < $v2); +} + +sub smaller +{ + local($v1, $v2) = ($a, $b); + + if ($sortN > 0) { + $v1 = (split(/\t/, $v1))[$sortN]; + $v2 = (split(/\t/, $v2))[$sortN]; + } else { + $v1 =~ s/.*\t//; + chop($v1); + $v2 =~ s/.*\t//; + chop($v2); + } + $v1 =~ s/[^0-9]+//; + $v2 =~ s/[^0-9]+//; + return ($v1 > $v2); +} + +sub tbl +{ + local($graph, $units, $title) = @_; + local(@values, @tmp, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\ncenter expand doublebox;\nl r.\nSystem\t$title\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $XXX = '$value = $'.$graph.'[$i];'; + eval '$value = $'.$graph.'[$i];'; + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + next if (!defined $value || $value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + push(@values, "$_\t$value\n"); + } + @values = sort smaller @values unless ($nosort); + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub tbl2 +{ + local($graph, $a, $b, $units, $title, $atitle, $btitle) = @_; + local(@values, @tmp, $line, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c\nl r r.\n"; + print FD "System\t$atitle\t\\fB$btitle\\fP\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + eval '$value = $'.$a.'[$i];'; + next if (!defined $value || $value <= 0); + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + eval '$value = $'.$b.'[$i];'; + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + next if (!defined $value || $value <= 0); + $line .= "$value\n"; + push(@values, $line); + } + unless ($nosort || $units eq "mhz") { + if ($units eq "MB") { + @values = sort bigger @values; + } else { + @values = sort smaller @values; + } + } + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub ipc +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_ipc.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c c\nl r r r.\n"; + print FD "System\tLibc bcopy\t\\fBpipe\\fP\tTCP\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_pipe[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_tcp_local[$i]; + $value = sprintf("%.0f", $value); + # next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort bigger @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub read +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_reread2.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c c|c c\nl|c c|c c\nl|r r|r r.\n"; + print FD "\tLibc\t\\fBFile\\fP\tMemory\tFile\nSystem\tbcopy\t\\fBread\\fP\tread\tmmap\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_reread[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mem_rdsum[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mmap[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort bigger @values unless ($nosort); + $sortN = 
0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub mem +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_allmem.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c s|c s\nl|c c|c c\nl|r r|r r.\n"; + print FD "\tBcopy\tMemory\nSystem\t\\fBunrolled\\fP\tlibc\tread\twrite\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_unrolled[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mem_rdsum[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + + $value = $bw_mem_wr[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 1; + @values = sort bigger @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); + + @values = (); + open(FD, ">tmp/lat_allmem.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c c\nl c c c\nl r r r.\n"; + print FD "\tLevel 1\tLevel 2\tMain\n"; + print FD "System\tcache\tcache\tmemory\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $lat_l1[$i]; + next if ($value <= 0); + if (&same($lat_l1[$i], $lat_l2[$i])) { + $value = "--"; + } + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $lat_l2[$i]; + next if ($value <= 0); + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + $value = "--"; + } + $line .= "$value\t"; + $value = $lat_mem[$i]; + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + + $sortN = 3; + @values = sort smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub procs +{ + local($graph, $a, $b, $c, $units) = @_; + local(@values, @tmp, $line, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl|c|c|c\nl|r|r|r.\n"; + print FD "\tfork\t\\fBfork, exec\\fP\tfork, exec\n"; + print FD "System\t& exit\t\\fB& exit\\fP\tsh -c & exit\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + eval '$value = $'.$a.'[$i];'; + $value = sprintf("%.1f", $value / 1000); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + eval '$value = $'.$b.'[$i];'; + $value = sprintf("%.0f", $value / 1000); + next if ($value <= 0); + $line .= "$value\\ \t"; + eval '$value = $'.$c.'[$i];'; + $value = sprintf("%.0f", $value / 1000); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort 
smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub ctx +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/ctx.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c s|c s\nl|c c|c c\nl|r r|r r.\n"; + print FD "\t2 processes\t8 processes\nSystem\t\\fB0KB\\fP\t32KB\t0KB\t32KB\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t"; + foreach $a ('lat_ctx', 'lat_ctx32', 'lat_ctx8', 'lat_ctx32_8') { + eval '$value = $'.$a.'[$i];'; + $line .= "$value\t"; + } + chop($line); + push(@values, "$line\\ \n"); + } + $sortN = 1; + @values = sort smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub papernames +{ + $_ = "IBM PowerPC" if /AIX powerpc\@134/; + $_ = "IBM Power2" if /AIX rs6000-990\@71/; + $_ = "FreeBSD/i586" if /FreeBSD i586\@13[01234]/; + $_ = "HP 9000/819" if /HP-UX 9000.819\@/; + $_ = "HP K210" if /HP-UX 9000.859\@/; + $_ = "SGI Challenge/R10K" if /IRIX.* IP25\@/; + $_ = "SGI Challenge/R4K" if /IRIX.* IP19\@/; + $_ = "SGI Indigo2" if /IRIX.* IP22\@/; + $_ = "Linux/Alpha" if /Linux alpha\@/; + $_ = "Linux/i686" if /Linux i686\@/; + $_ = "Linux/i586" if /Linux i586\@/; + $_ = "DEC Alpha\@150" if /OSF1 alpha\@147/; + $_ = "DEC Alpha\@300" if /OSF1 alpha\@303/; + $_ = "Sun SC1000" if /SunOS-5.5 sun4d\@5/; + $_ = "Sun Ultra1" if /SunOS-5.5 sun4u/; + $_ = "Solaris/i686" if /SunOS-5.5.1 i86pc\@13/; + $_ = "Unixware/i686" if /UNIX_SV x86at/; +} + +sub bg +{ + local($graph, $units, $title) = @_; + local($persec, $value); + + if ($nosort) { + open(FD, ">tmp/$graph.bg"); + } else { + open(FD, "|sort -nr > tmp/$graph.bg"); + } + for ($i = 0; $i <= $#uname; $i++) { + @info = 
&getinfo($uname[$i], $misc_mhz[$i]); +# eval "\$value = \$$graph[$i];"; + + $XXX = '$value = $'.$graph.'[$i];'; + eval '$value = $'.$graph.'[$i];'; + if ($uname[$i] =~ /IRIX/) { + $fill = " %%fill0"; + } elsif ($uname[$i] =~ /HP/) { + $fill = " %%fill.3"; + } elsif ($uname[$i] =~ /AIX/) { + $fill = " %%fill.1"; + } elsif ($uname[$i] =~ /OSF/) { + $fill = " %%fill.5"; + } elsif ($uname[$i] =~ /Linux/) { + $fill = " %%fill.7"; + } elsif ($uname[$i] =~ /Sun/) { + $fill = " %%fill1"; + } else { + $fill = ""; + } + if ($units eq "usecs") { + if (!defined $value || $value <= 0) { + warn + "$ARGV[$i] $graph $info[$#info]: value is 0\n"; + $persec = 0; + $value = 0; + } else { + $persec = 1000000 / $value; + } + if (0) { + printf FD + "%.0f\t$info[3] $info[$#info] $value\\ $units$fill\n", + $persec; + } else { + printf FD + "%.0f\t%s %s $value\\ $units$fill\n", + $persec, $file[$i], &getos($uname[$i]); + } + } elsif ($units eq "MB") { + printf FD "$value\t$info[3] $info[$#info]$fill\n"; + } elsif ($units eq "mhz") { + printf FD "$value\t$info[3] $info[$#info]$fill\n"; + } else { + die "Unknown units: $units"; + } + } + if ($slide) { + print FD "%Title n $title\n"; + print FD "%ps 12\n"; + print FD "%ft HB\n"; + } else { + print FD "%Title n $title\n"; + print FD "%Title s lmbench v1.1\n"; + print FD "%ps 16\n"; + print FD "%ft R\n"; + } + close(FD); +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = $_[1]; + + $mhz =~ s/[\. 
].*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /AIX/) { + $name = "$name@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + $info[5] =~ s/-.*//; + "$info[3] $info[5]"; +} + diff --git a/performance/lmbench3/scripts/getbw b/performance/lmbench3/scripts/getbw new file mode 100755 index 0000000..27b182b --- /dev/null +++ b/performance/lmbench3/scripts/getbw @@ -0,0 +1,260 @@ + +# Extract the bandwith information. +# Usage: getbw file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getbw 1.6 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# +# Default is file bandwidth which lists: mem read, file read (both), +# mmap read (both), bcopy. +# +# -mem turns off the file stuff but turns on rd, wr, rdwr, frd, fwr, +# bcopy, bzero, cp, fcp. 
+# +foreach $file (@ARGV) { + open(FD, $file); + &cache; + open(FD, $file); + ($f = $file) =~ s|/|-|; + if ($mem || $all) { + print "tmp/bwmem.$f\n"; + open(OUT, ">tmp/bwmem.$f"); + } else { + print "tmp/bwfile.$f\n"; + open(OUT, ">tmp/bwfile.$f"); + } + print OUT "%X Memory size \n%Y Bandwidth in MB/sec\n"; + while (<FD>) { + chop; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/^\d+.*Mhz/) { + @_ = split; + $mhz = $_[0]; + $tmp = &getinfo("$uname", $mhz); + if ($mem) { + print OUT "%T Memory bandwidth for $tmp\n"; + } else { + print OUT "%T Reread bandwidth for $tmp\n"; + } + } + if (/MHZ/) { + @_ = split; + $mhz = $_[1]; + chop($mhz) if $mhz =~ /]$/; + $tmp = &getinfo("$uname", $mhz); + if ($mem) { + print OUT "%T Memory bandwidth for $tmp\n"; + } else { + print OUT "%T Reread bandwidth for $tmp\n"; + } + } + if ((!$all && !$mem) && /^"read bandwidth/) { + print OUT "\"File reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"read open2close bandwidth/) { + print OUT "\"File open2close reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"Mmap read bandwidth/) { + print OUT "\"File mmap reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"Mmap read open2close bandwidth/) { + print OUT "\"File mmap open2close reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ($all && /^"libc bcopy aligned/) { + print OUT "\"libc bcopy aligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (/^"libc bcopy unaligned/) { + print OUT "\"libc bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ($all && /^"unrolled bcopy aligned/) { + print OUT "\"libc bcopy 
unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^"unrolled bcopy unaligned/) { + print OUT "\"unrolled bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^"unrolled partial bcopy unaligned/) { + print OUT "\"unrolled partial bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (/^Memory read bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial read bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial read.write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory bzero bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + } +} + +# Paw through the data and figure out how big the L1 cache is. +# We look at the memory read performance and look for cluster breaks +# at 4, 8, 16, 32, 64, 126, and 256k. 
+sub cache +{ + local($in) = 0; + local($n, $sum, $avg) = (0,0,0); + + $cache = 0; + while (<FD>) { + if (/^Memory partial read bandwidth/) { + $in = 1; + next; + } + next unless $in; + @_ = split; + if ($n == 0) { + $sum += $_[1]; + $n++; + next; + } + $avg = $sum/$n; + if ($_[1] < .75*$avg) { + $cache = $last; + return; + } + $last = $_[0]; + $sum += $_[1]; + $n++; + } +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz); + + $mhz = $_[1]; + $_ = $_[0]; + @info = split; + $name = pop(@info); + chop($name); + if ($name eq "unknown") { + $name = pop(@info); + } + if ($name eq "mips") { + $name = "$info[$#info]\@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]\@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]\@$mhz"; + } else { + $name .= "\@$mhz"; + } + "$info[3] $name"; +} diff --git a/performance/lmbench3/scripts/getctx b/performance/lmbench3/scripts/getctx new file mode 100755 index 0000000..da7d645 --- /dev/null +++ b/performance/lmbench3/scripts/getctx @@ -0,0 +1,79 @@ + +# Extract the context switching information from lmbench result files. +# Usage: getctx file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getctx 1.8 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$title = "foo" if 0; + +foreach $file (@ARGV) { + open(FD, $file); + while (<FD>) { + chop; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^.size=/) { + s/size/Process size/; + s/ ovr/\toverhead/; + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|/|-|; + print "tmp/ctx.$f\n"; + open(OUT, ">tmp/ctx.$f"); + print OUT "\"%X Processes \n\"%Y Time in microseconds\n"; + if ($title) { + print OUT "%T $f\n"; + } else { + print OUT + "\"%T Context switches for " . 
+ "$info[3] $info[$#info]Mhz\n"; + } + print OUT "$_\n"; + while (<FD>) { + last if /^Null/ || /^Pipe/ || /^Memor/; + next if /\$Id/; + next if m|scripts/lmbench: /dev/tty|; + s/ ovr/\toverhead/; + s/size/Process size/; + print OUT; + } + close(OUT); + last; + } + } +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz); + + ($mhz = $_[1]) =~ s/[\. ].*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/getdisk b/performance/lmbench3/scripts/getdisk new file mode 100755 index 0000000..3d0199b --- /dev/null +++ b/performance/lmbench3/scripts/getdisk @@ -0,0 +1,69 @@ + +# Extract the disk graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: getdisk 1.2 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|/|-|; + while (<FD>) { + next unless /DISK_DESC/; + s/.DISK_DESC: //; + chop; chop; chop; + @_ = split(/[\[\]]/, $_); + foreach $_ (@_) { + next unless /:/; + @foo = split(/:/, $_); + $foo[0] =~ s|/dev/||; + $disks{$foo[0]} = $foo[1]; + } + last; + } + while (<FD>) { + if (/^"Seek times for \/dev\/(.*)$/) { + $ok = 0; + foreach $key (keys %disks) { + next unless $key eq $1; + $ok = 1; + } + if ($ok != 1) { + die "Disk results are screwed up, no $1.\n"; + } + print "tmp/seek_$1.$file\n"; + open(OUT, ">tmp/seek_$1.$file"); + print OUT "%T Seek times for $disks{$1}\n"; + print OUT "%X Seek distance (MB)\n"; + print OUT "%Y Time in millisec\n"; + while (<FD>) { + last unless /^\d/; + print OUT; + } + close(OUT); + } + if (/^"Zone bandwidth for \/dev\/(.*)$/) { + $ok = 0; + foreach $key (keys %disks) { + next unless $key eq $1; + $ok = 1; + } + if ($ok != 1) { + die "Disk results are screwed up, no $1.\n"; + } + print "tmp/zone_$1.$file\n"; + open(OUT, ">tmp/zone_$1.$file"); + print OUT "%T Zone bandwidths for $disks{$1}\n"; + print OUT "%X Disk offset (MB)\n"; + print OUT "%Y Bandwidth (MB/sec)\n"; + while (<FD>) { + last unless /^\d/; + print OUT; + } + close(OUT); + } + } +} +exit 0; diff --git a/performance/lmbench3/scripts/getlist b/performance/lmbench3/scripts/getlist new file mode 100755 index 0000000..8c35970 --- /dev/null +++ b/performance/lmbench3/scripts/getlist @@ -0,0 +1,31 @@ + +# Find everything in my results directory that looks like lmbench output. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxxxxxxx) +# Copyright (c) 1994-1998 Larry McVoy. 
+# $Id$ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$LIST = "no such file"; +$LIST = "LIST" if (-f "LIST"); +$LIST = $ARGV[0] if (($#ARGV == 0) && (-f $ARGV[0])); +if (-f $LIST) { + open(L, $LIST); + $_ = <L>; + chop; + @files = split; + close(L); +} else { + @files = <*/*>; +} +foreach $file (@files) { + next if $file =~ /\.INFO$/; + open(FD, $file) || next; + next unless defined($_ = <FD>); + close(FD); + next unless /^\[lmbench3.[01]/; + print "$file "; +} +print "\n"; +exit 0; diff --git a/performance/lmbench3/scripts/getmax b/performance/lmbench3/scripts/getmax new file mode 100755 index 0000000..754b50c --- /dev/null +++ b/performance/lmbench3/scripts/getmax @@ -0,0 +1,73 @@ + +# Look at a bunch of bargraph files and figure out the max amongst them all. +# Usage: getmax file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getmax 1.10 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$graph = 1 if 0; +$exit = 1; +foreach $file (@ARGV) { + $exit = 0 if -f $file; +} +exit $exit if $noop; + +$noop = 1 if 0; +$max_X = $max_Y = -1000000000; +$min_X = $min_Y = 1000000000; +foreach $file (@ARGV) { + next if $rmmax; + unless (open(FD, $file)) { + warn "Can't open $file\n"; + next; + } + while (<FD>) { + next if /^"/; + next if /^%/; + next if /^\s*$/; + next if m|scripts/lmbench: /dev/tty|; + @_ = split; + $min_X = $_[0] if ($_[0] < $min_X); + $min_Y = $_[1] if ($_[1] < $min_Y); + $max_X = $_[0] if ($_[0] > $max_X); + $max_Y = $_[1] if ($_[1] > $max_Y); + } + close(FD); +} +$half = 0 if 0; # lint +$max_X /= 2 if ($half); +foreach $file (@ARGV) { + unless (open(FD, $file)) { + warn "Can't open $file\n"; + next; + } + @lines = <FD>; + open(FD, ">$file") || die "Can't open $file\n"; + if ($graph) { + print FD "%fakemin-X $min_X\n"; + print FD "%fakemin-Y $min_Y\n"; + print FD "%fakemax-X $max_X\n"; + print FD "%fakemax-Y $max_Y\n"; + 
foreach $_ (@lines) { + next if /^%fakem/; + print FD; + } + warn "Max X is $max_X\n" if $v; + warn "Max Y is $max_Y\n" if $v; + } elsif ($rmmax) { + foreach $_ (@lines) { + next if /^%fakem/; + print FD; + } + } else { + print FD @lines; + print FD "%fakemax $max_X\n"; + warn "Max X is $max_X\n" if $v; + } + close(FD); +} +exit $exit; diff --git a/performance/lmbench3/scripts/getmem b/performance/lmbench3/scripts/getmem new file mode 100755 index 0000000..d3ea7ac --- /dev/null +++ b/performance/lmbench3/scripts/getmem @@ -0,0 +1,69 @@ + +# Extract the memory latency graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getmem 1.7 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|/|-|; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^Memory load latency/) { + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "tmp/mem.$f\n"; + open(OUT, ">tmp/mem.$f"); + print OUT "\"%X Array size\n\"%Y Latency in nanoseconds\n"; + print OUT + "\"%T $file $info[3] $info[$#info] memory latencies\n"; + while (<FD>) { + next if /\$Id/; + next if /^\[/; + print OUT; + } + close(OUT); + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = $_[1]; + + $mhz =~ s/\..*//; + $mhz =~ s/ .*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git 
a/performance/lmbench3/scripts/getpercent b/performance/lmbench3/scripts/getpercent new file mode 100755 index 0000000..6ede4c2 --- /dev/null +++ b/performance/lmbench3/scripts/getpercent @@ -0,0 +1,400 @@ + +# Generate an ascii percentage summary from lmbench result files. +# Usage: getpercent file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getpercent 1.9 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$n = 0; # apparently, hpux doesn't init to 0???? + +foreach $file (@ARGV) { + push(@files, $file); + open(FD, $file) || die "$0: can't open $file"; + $file =~ s|/|-|; + $file =~ s/\.\d+//; + push(@file, $file); + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + split; + push(@uname, "@_"); + } + if (/Mhz/) { + split; + push(@misc_mhz, $_[0]); + } + if (/^Null syscall:/) { + split; + push(@lat_nullsys, $_[2]); + } + if (/^Pipe latency:/) { + split; + push(@lat_pipe, $_[2]); + } + if (/UDP latency using localhost:/) { + split; + push(@lat_udp_local, $_[4]); + } + if (/TCP latency using localhost/) { + split; + push(@lat_tcp_local, $_[4]); + } + if (/RPC.udp latency using localhost/) { + split; + push(@lat_rpc_udp_local, $_[4]); + } + if (/RPC.tcp latency using localhost/) { + split; + push(@lat_rpc_tcp_local, $_[4]); + } + if (/^Process fork.exit/) { + split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + split; + push(@lat_shproc, $_[3]); + } + if (/size=0 ovr=/) { + while (<FD>) { + next unless /^2/; + split; + push(@lat_ctx, $_[1]); + last; + } + while (<FD>) { + next unless /^8/; + split; + push(@lat_ctx8, $_[1]); + last; + } + } + if (/^Pipe bandwidth/) { + split; + push(@bw_pipe, $_[2]); + } + if (/^Socket bandwidth using localhost/) { + split; + push(@bw_tcp_local, $_[4]); + } + if 
(/^File .* write bandwidth/) { + split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + split; + push(@lat_l1, $_[1]); + } elsif (/^0.12500\s/) { + split; + push(@lat_l2, $_[1]); + } elsif (/^[45678].00000\s/) { + split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + if (/^"stride=8192/) { # XXX assumes <= 8K pagesize + $tbl = -1; + while (<FD>) { + if (/^[45678].00000\s/) { + split; + $tlb = $_[1]; + $size = $_[0]; + last if /^8.00000\s/; + } + } + if (!/^8/) { + warn "$file: No 8MB tlb latency, using $size\n"; + } + push(@lat_tlb, $tlb); + } + } + foreach $array ( + 'misc_mhz', 'lat_nullsys', 'lat_pipe', 'lat_udp_local', + 'lat_tcp_local', 'lat_rpc_udp_local', + 'lat_rpc_tcp_local', 'lat_nullproc', 'lat_simpleproc', + 'lat_ctx', 'lat_ctx8', 'bw_pipe', 'bw_tcp_local', + 'bw_file', 'lat_mappings', 'bw_reread', 'bw_mmap', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_mem_rdsum', + 'bw_mem_wr', 'lat_l1', 'lat_l2', 'lat_mem', 'lat_tlb', + ) { + eval "if (\$#$array 
!= $n) { + warn \"No data for $array in $file\n\"; + push(\@$array, -1); + }"; + } + $n++; +} +exit 0; + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest vvalue before the blank line. +sub getbiggest +{ + local($msg) = @_; + + undef $save; + $value = 0; + while (<FD>) { + last if /^\s*$/; + $save = $_ if /^\d\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 8) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +print<<EOF; + + L M B E N C H 1 . 0 S U M M A R Y + ------------------------------------ + + Comparison to best of the breed + ------------------------------- + + (Best numbers are starred, i.e., *123) + + + Processor, Processes - factor slower than the best + -------------------------------------------------- +Host OS Mhz Null Null Simple /bin/sh Mmap 2-proc 8-proc + Syscall Process Process Process lat ctxsw ctxsw +--------- ------------- ---- ------- ------- ------- ------- ---- ------ ------ +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%4.0f %7s %7s %7s %7s %4s %6s %6s\n", + $misc_mhz[$i], + &smaller(@lat_nullsys, $i, 0), + &smaller(@lat_nullproc, $i, 1024), + &smaller(@lat_simpleproc, $i, 1024), + &smaller(@lat_shproc, $i, 1024), + &smaller(@lat_mappings, $i, 0), + &smaller(@lat_ctx, $i, 0), + &smaller(@lat_ctx8, $i, 0); + +} + +print<<EOF; + + *Local* Communication latencies - factor slower than the best + ------------------------------------------------------------- +Host OS Pipe UDP RPC/ TCP RPC/ + UDP TCP +--------- ------------- ------- ------- ------- ------- ------- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%7s %7s %7s %7s %7s\n", + &smaller(@lat_pipe, $i, 0), + &smaller(@lat_udp_local, $i, 0), + &smaller(@lat_rpc_udp_local, $i, 0), + 
&smaller(@lat_tcp_local, $i, 0), + &smaller(@lat_rpc_tcp_local, $i, 0); + +} + +print<<EOF; + + *Local* Communication bandwidths - percentage of the best + --------------------------------------------------------- +Host OS Pipe TCP File Mmap Bcopy Bcopy Mem Mem + reread reread (libc) (hand) read write +--------- ------------- ---- ---- ------ ------ ------ ------ ---- ----- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%4s %4s %6s %6s %6s %6s %4s %5s\n", + &bigger(@bw_pipe, $i), + &bigger(@bw_tcp_local, $i), + &bigger(@bw_reread, $i), + &bigger(@bw_mmap, $i), + &bigger(@bw_bcopy_libc, $i), + &bigger(@bw_bcopy_unrolled, $i), + &bigger(@bw_mem_rdsum, $i), + &bigger(@bw_mem_wr, $i); +} + +print<<EOF; + + Memory latencies in nanoseconds - factor slower than the best + (WARNING - may not be correct, check graphs) + ------------------------------------------------------------- +Host OS Mhz L1 \$ L2 \$ Main mem Guesses +--------- ------------- --- ---- ---- -------- ------- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s %3d", + $file[$i], &getos($uname[$i]), $misc_mhz[$i]; + if ($lat_l1[$i] < 0) { + printf "%6s %6s %11s %s", + "-", "-", "-", + "Bad mhz?"; + } else { + $msg = &check_caches; + if ($msg =~ /L1/) { + $lat_l1[$i] = -1; + } elsif ($msg =~ /L2/) { + $lat_l2[$i] = -1; + } + printf "%6s %6s %11s", + &smaller(@lat_l1, $i, 0), + &smaller(@lat_l2, $i, 0), + &smaller(@lat_mem, $i, 0); + if ($msg =~ /L/) { + print "$msg"; + } + } + print "\n"; +} + + +exit 0; + +# Return factor of the smallest number. 
+sub smaller +{ + local(@values) = @_; + local($which, $min, $i, $units); + + $units = pop(@values); + $which = pop(@values); + $min = 0x7fffffff; + foreach $i (@values) { + next if $i == -1 || $i == 0; + $min = $i if ($min > $i); + } + if ($values[$which] == $min) { + #"***"; + if ($units == 1024) { + sprintf("*%.1fK", $values[$which]/1024.); + } else { + sprintf("*%d", $values[$which]); + } + } elsif ($values[$which] == -1) { + "???"; + } elsif ($values[$which] == 0) { + "???"; + } elsif ($values[$which] / $min < 10.0) { + sprintf("%.1f", $values[$which] / $min); + } else { + sprintf("%.0f", $values[$which] / $min); + } +} + +# Return closeness to the largest number as a percentage. +# Exact match is 100%, smaller numbers are like 15%. +sub bigger +{ + local(@values) = @_; + local($which, $max, $i); + + $which = pop(@values); + $max = 0; + foreach $i (@values) { + $max = $i if ($max < $i); + } + if ($values[$which] == $max) { + sprintf("*%d", $values[$which]); + } else { + sprintf("%d%%", $values[$which] / $max * 100); + } +} + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} diff --git a/performance/lmbench3/scripts/getresults b/performance/lmbench3/scripts/getresults new file mode 100755 index 0000000..c5665b5 --- /dev/null +++ b/performance/lmbench3/scripts/getresults @@ -0,0 +1,99 @@ +#!/usr/bin/perl -ws + +# Search through the archives splitting out stuff that has pathnames. 
+ +while (1) { + &headers; + &body; +} + +sub headers +{ + while (<>) { + warn "HDR $_" if ($debug); + return if /^\s*$/; + } + exit; +} + +# Save the info for the system, skipping everything ig there is no info. +sub body +{ + @info = (); + while (<>) { + last if m|^[-]+ \.\./results|; + last if /^\[lmbench/; + if (/^From[: ]/) { warn "FROM $_"; return; } + warn "INFO $_" if ($debug); + push(@info, $_); + } + if (/^[-]+ \.\.\/results/) { + @foo = split; + $path = $foo[1]; + $path =~ s|\.\./||; + warn "PATH $path\n" if ($debug); + &results; + return; + } + warn "SKIPPING one\n"; + while (<>) { + warn "SKIP $_" if ($SKIP); + last if /^Memory load latency/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } + die "No memory load latency" unless /^Memory load latency/; + while (<>) { + warn "SKIP $_" if ($SKIP); + last if /^\[/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } + die "No date" unless /^\[/; + while (<>) { + last unless /^\s*$/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } +} + +sub results +{ + @results = (); + while (<>) { + goto done if (/^From[: ]/); + warn "RES $_" if ($RES); + push(@results, $_); + last if /^Memory load latency/; + } + die "No memory load latency" unless /^Memory load latency/; + while (<>) { + goto done if (/^From[: ]/); + warn "RES $_" if ($RES); + push(@results, $_); + last if /^\[/; + } + die "No date" unless /^\[/; + while (<>) { + last unless /^\s*$/; + } + +done: + ($dir = $path) =~ s|/[^/]+$||; + warn "DIR $dir\n" if ($debug); + system "mkdir -p $dir"; + if (-e $path) { + warn "CONFLICT on $path\n" if $debug; + for ($i = 0; ; $i++) { + $tmp = "${path}.${i}"; + last if ! -e $tmp; + warn "CONFLICT on $tmp\n" if $debug; + } + $path = $tmp; + } + $info = $path . 
".INFO"; + open(O, ">$info"); + print O @info; + close(O); + warn "Saving $path\n" if $verbose; + open(O, ">$path"); + print O @results; + close(O); +} diff --git a/performance/lmbench3/scripts/getsummary b/performance/lmbench3/scripts/getsummary new file mode 100755 index 0000000..43bdae5 --- /dev/null +++ b/performance/lmbench3/scripts/getsummary @@ -0,0 +1,1089 @@ + +# Generate an ascii summary from lmbench result files. +# Usage: getsummary file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getsummary 1.34 05/02/17 16:40:22+02:00 staelin@xxxxxxxxxxxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Use these constants to same typo-induced bugs later! +$M = 1000000.; +$K = 1000.; + +$n = 0; +foreach $file (@ARGV) { + open(FD, $file) || die "$0: can't open $file"; + $file =~ s/\.\d+$//; + @_ = split(/\//, $file); + push(@host, $_[$#_]); + $file = $_[$#_ - 1]; + $file =~ s|/|-|; + push(@file, $file); + $lat_mem_rd_type = -1; + $mhz = 0; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + $version = -1; + push(@uname, $_); + if (/lmbench1\./) { + $version = 1; + } + if (/lmbench2\./) { + $version = 2; + } + if (/lmbench3\./) { + $version = 3; + } + } + if (/MHZ/ && !$mhz) { + @_ = split; + $_[1] =~ s/\]//; + push(@misc_mhz, $_[1]); + $mhz = 1; + } elsif (/Mhz/ && !$mhz) { + @_ = split; + push(@misc_mhz, $_[0]); + $mhz = 1; + } + if (/^Select on 100 fd/) { + @_ = split; + push(@lat_fd_select, $_[4]); + } + if (/^Select on 100 tcp fd/) { + @_ = split; + push(@lat_tcp_select, $_[5]); + } + if (/^integer bit:/) { + @_ = split; + push(@integer_bit, $_[2]); + } + if (/^integer add:/) { + @_ = split; + push(@integer_add, $_[2]); + } + if (/^integer mul:/) { + @_ = split; + push(@integer_mul, $_[2]); + } + if (/^integer div:/) { + @_ = split; + push(@integer_div, $_[2]); + } + if (/^integer mod:/) { + @_ = split; + 
push(@integer_mod, $_[2]); + } + if (/^uint64 bit:/) { + @_ = split; + push(@int64_bit, $_[2]); + } + if (/^uint64 add:/) { + @_ = split; + push(@int64_add, $_[2]); + } + if (/^uint64 mul:/) { + @_ = split; + push(@int64_mul, $_[2]); + } + if (/^uint64 div:/) { + @_ = split; + push(@int64_div, $_[2]); + } + if (/^uint64 mod:/) { + @_ = split; + push(@int64_mod, $_[2]); + } + if (/^float add:/) { + @_ = split; + push(@float_add, $_[2]); + } + if (/^float mul:/) { + @_ = split; + push(@float_mul, $_[2]); + } + if (/^float div:/) { + @_ = split; + push(@float_div, $_[2]); + } + if (/^double add:/) { + @_ = split; + push(@double_add, $_[2]); + } + if (/^double mul:/) { + @_ = split; + push(@double_mul, $_[2]); + } + if (/^double div:/) { + @_ = split; + push(@double_div, $_[2]); + } + if (/^float bogomflops:/) { + @_ = split; + push(@float_bogomflops, $_[2]); + } + if (/^double bogomflops:/) { + @_ = split; + push(@double_bogomflops, $_[2]); + } + if (/LINE_SIZE/) { + @_ = split; + $_[1] =~ s/\]//; + push(@line_size, $_[1]); + } + if (/SYNC_MAX/) { + @_ = split; + $_[1] =~ s/\]//; + push(@load, $_[1]); + } + if (/^tlb:/) { + @_ = split; + push(@tlb, $_[1]); + } + if (/^Simple syscall:/) { + @_ = split; + push(@lat_syscall, $_[2]); + } + if (/^Simple read:/) { + @_ = split; + push(@lat_read, $_[2]); + } + if (/^Simple write:/) { + @_ = split; + push(@lat_write, $_[2]); + } + if (/^Simple stat:/) { + @_ = split; + push(@lat_stat, $_[2]); + } + if (/^Simple open.close:/) { + @_ = split; + push(@lat_openclose, $_[2]); + } + if (/^Null syscall:/) { # Old format. 
+ @_ = split; + push(@lat_write, $_[2]); + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_siginstall, $_[3]); + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_sigcatch, $_[3]); + } + if (/^Protection fault:/) { + @_ = split; + push(@lat_protfault, $_[2]); + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + } + if (/AF_UNIX sock stream latency:/) { + @_ = split; + push(@lat_unix, $_[4]); + } + if (/UDP latency using localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + } elsif (/UDP latency using/) { + @_ = split; + push(@lat_udp_remote, $_[4]); + } + if (/TCP latency using localhost:/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + } elsif (/TCP latency using/) { + @_ = split; + push(@lat_tcp_remote, $_[4]); + } + if (/RPC.udp latency using localhost:/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + } elsif (/RPC.udp latency using/) { + @_ = split; + push(@lat_rpc_udp_remote, $_[4]); + } + if (/RPC.tcp latency using localhost:/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + } elsif (/RPC.tcp latency using/) { + @_ = split; + push(@lat_rpc_tcp_remote, $_[4]); + } + if (/TCP.IP connection cost to localhost:/) { + @_ = split; + push(@lat_tcp_connect_local, $_[5]); + } elsif (/TCP.IP connection cost to/) { + @_ = split; + push(@lat_tcp_connect_remote, $_[5]); + } + if (/^Socket bandwidth using localhost/) { + $value = &getbiggest("Socket bandwidth using localhost"); + push(@bw_tcp_local, $value); +# } elsif (/^Socket bandwidth using /) { +# $value = &getbiggest("Socket bandwidth using remote"); +# push(@bw_tcp_remote, $value); + } + if (/^AF_UNIX sock stream bandwidth:/) { + @_ = split; + push(@bw_unix, $_[4]); + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + } + if (/^Pipe bandwidth/) { + @_ = 
split; + push(@bw_pipe, $_[2]); + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^Memory load parallelism/) { + $value = &getbiggest("Memory load parallelism"); + push(@mem_load_par, $value); + } + + if (/^"File system latency/) { + while (<FD>) { + next if /Id:/; + if (/^0k/) { + @_ = split; + push(@fs_create_0k, $_[2]); + push(@fs_delete_0k, $_[3]); + } elsif (/^1k/) { + @_ = split; + push(@fs_create_1k, $_[2]); + push(@fs_delete_1k, $_[3]); + } elsif (/^4k/) { + @_ = split; + push(@fs_create_4k, $_[2]); + push(@fs_delete_4k, $_[3]); + } elsif (/^10k/) { + @_ = split; + push(@fs_create_10k, $_[2]); + push(@fs_delete_10k, $_[3]); + } else { + last; + } + } + } + if (/size=0/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx0_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx0_8, $_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx0_16, $_[1]); + } + last if /^\s*$/ || /^Memory/; + } + } + if (/size=16/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx16_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx16_8, 
$_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx16_16, $_[1]); + } + last if /^\s*$/; + } + } + if (/size=64/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx64_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx64_8, $_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx64_16, $_[1]); + } + last if /^\s*$/ || /^20/; + } + } + if (/^Memory load latency/) { + $lat_mem_rd_type = 1; + } + if (/^Random load latency/) { + $lat_mem_rd_type = 2; + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^\s*$/) { + last; + } + @_ = split; + $size = $_[0]; + $save = $_[1]; + if ($size == 0.00098 && $lat_mem_rd_type == 1) { + push(@lat_l1, $_[1]); + } elsif ($size == 0.12500 && $lat_mem_rd_type == 1) { + push(@lat_l2, $_[1]); + } + } + if ($size < 8.0) { + warn "$file: No 8MB memory latency, using $size\n"; + } + if ($lat_mem_rd_type == 1) { + push(@lat_mem, $save); + } + } + if (/^"stride=16/) { + $save = -1; + while (<FD>) { + if (/^\s*$/) { + last; + } + @_ = split; + $size = $_[0]; + $save = $_[1]; + } + if ($size < 8.0) { + warn "$file: No 8MB random access memory latency, using $size\n"; + } + if ($lat_mem_rd_type == 2) { + warn "$file: lat_mem_rand = $save\n"; + push(@lat_mem_rand, $save); + } + } + } + @warn = (); + foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_tcp_remote', 'bw_unix', + 'double_add', 'double_bogomflops', 'double_div', 'double_mul', + 'float_add', 'float_bogomflops', 'float_div', 'float_mul', + 'fs_create_0k', 'fs_create_1k', 'fs_create_4k', + 'fs_create_10k', 'fs_delete_0k', 'fs_delete_1k', + 'fs_delete_4k', 'fs_delete_10k', 'integer_add', + 'integer_bit', 'integer_div', 'integer_mod', 'integer_mul', + 'lat_ctx0_2', 'lat_ctx0_8', 'lat_ctx0_16', + 'lat_ctx16_2', 'lat_ctx16_8', 'lat_ctx16_16', + 'lat_ctx64_2', 'lat_ctx64_8', 'lat_ctx64_16', + 'lat_l1', 'lat_l2', 'lat_mappings', 'lat_mem', + 'lat_mem_rand', 
'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', 'lat_rpc_tcp_local', + 'lat_rpc_tcp_remote', 'lat_rpc_udp_local', + 'lat_rpc_udp_remote', 'lat_fd_select', 'lat_tcp_select', + 'lat_shproc', 'lat_sigcatch', 'lat_siginstall', + 'lat_simpleproc', 'lat_stat', 'lat_syscall', + 'lat_tcp_connect_local', 'lat_tcp_connect_remote', + 'lat_tcp_local', 'lat_tcp_remote', + 'lat_udp_local', 'lat_udp_remote', 'lat_unix', 'lat_write', + 'line_size', 'mem_load_par', 'misc_mhz', 'tlb', 'load', + 'int64_add', 'int64_bit', 'int64_div', 'int64_mod', + 'int64_mul' + ) { + $last = eval '$#' . $array; + if ($last != $n) { + #warn "No data for $array in $file\n"; + push(@warn, $array); + eval 'push(@' . $array . ', -1);'; + } + } +# if ($#warn != -1) { +# warn "Missing data in $file: @warn\n"; +# } + $n++; +} + +print<<EOF; + + L M B E N C H 3 . 0 S U M M A R Y + ------------------------------------ + (Alpha software, do not distribute) + +EOF + +&print_basic; +&print_process; +&print_int; +&print_uint64; +&print_float; +&print_double; +&print_ctx; +&print_ipc_local; +&print_ipc_remote; +&print_file_vm; +&print_bw_ipc_local; +&print_mem; + +exit 0; + +sub print_basic +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'tlb', 'line_size', 'mem_load_par' )) <= 0) { + return; + } + print<<EOF; +Basic system parameters +------------------------------------------------------------------------------ +Host OS Description Mhz tlb cache mem scal + pages line par load + bytes +--------- ------------- ----------------------- ---- ----- ----- ------ ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'tlb', 'line_size', 'mem_load_par' )) <= 0) { + next; + } + printf "%-9.9s %13.13s %23.23s ", + $host[$i], &getos($uname[$i]), $file[$i]; + printf "%4.4s %5.5s %5.5s %6.6s %4.4s\n", + &inum($misc_mhz[$i], 4), + &inum($tlb[$i], 5), + &inum($line_size[$i], 5), + &num($mem_load_par[$i], 6), + &inum($load[$i], 4); + } +} + +sub 
print_process +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_syscall', 'lat_read', 'lat_write', + 'lat_stat', 'lat_openclose', 'lat_tcp_select', + 'lat_siginstall', 'lat_sigcatch', + 'lat_nullproc', 'lat_simpleproc', + 'lat_shproc' )) <= 0) { + return; + } + print<<EOF; + +Processor, Processes - times in microseconds - smaller is better +------------------------------------------------------------------------------ +Host OS Mhz null null open slct sig sig fork exec sh + call I/O stat clos TCP inst hndl proc proc proc +--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- +EOF + + @fs_delete_4k = @lat_ctx0_8 = @bw_file = @lat_ctx0_16 = @fs_delete_1k = + @fs_create_4k = @fs_create_1k + if 0; # lint + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_syscall', 'lat_read', 'lat_write', + 'lat_stat', 'lat_openclose', 'lat_tcp_select', + 'lat_siginstall', 'lat_sigcatch', + 'lat_nullproc', 'lat_simpleproc', + 'lat_shproc' )) <= 0) { + next; + } + # If they have no /dev/zero, use /dev/null, else average them. 
+ if ($lat_read[$i] == -1) { + $tmp = $lat_write[$i]; + } else { + $tmp = ($lat_read[$i] + $lat_write[$i]) / 2; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%4.0f %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s\n", + $misc_mhz[$i], + &num($lat_syscall[$i], 4), + &num($tmp, 4), + &num($lat_stat[$i], 4), + &num($lat_openclose[$i], 4), + &num($lat_tcp_select[$i], 4), + &num($lat_siginstall[$i], 4), + &num($lat_sigcatch[$i], 4), + &num($lat_nullproc[$i], 4), + &num($lat_simpleproc[$i], 4), + &num($lat_shproc[$i], 4); + } +} + +sub print_int +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'integer_bit', 'integer_add', + 'integer_mul', 'integer_div', + 'integer_mod' )) <= 0) { + return; + } + print<<EOF; + +Basic integer operations - times in nanoseconds - smaller is better +------------------------------------------------------------------- +Host OS intgr intgr intgr intgr intgr + bit add mul div mod +--------- ------------- ------ ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'integer_bit', 'integer_add', + 'integer_mul', 'integer_div', + 'integer_mod' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s %6.6s\n", + &scale_num($integer_bit[$i], 6, $load[$i]), + &scale_num($integer_add[$i], 6, $load[$i]), + &scale_num($integer_mul[$i], 6, $load[$i]), + &scale_num($integer_div[$i], 6, $load[$i]), + &scale_num($integer_mod[$i], 6, $load[$i]); + } +} + +sub print_uint64 +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'int64_bit', 'int64_add', + 'int64_mul', 'int64_div', + 'int64_mod' )) <= 0) { + return; + } + print<<EOF; + +Basic uint64 operations - times in nanoseconds - smaller is better +------------------------------------------------------------------ +Host OS int64 int64 int64 int64 int64 + bit add mul div mod +--------- ------------- ------ ------ ------ ------ ------ +EOF + + for ($i = 
0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'int64_bit', 'int64_add', + 'int64_mul', 'int64_div', + 'int64_mod' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf " %5.5s %6.6s %6.6s %6.6s %6.6s\n", + &scale_num($int64_bit[$i], 6, $load[$i]), + &scale_num($int64_add[$i], 6, $load[$i]), + &scale_num($int64_mul[$i], 6, $load[$i]), + &scale_num($int64_div[$i], 6, $load[$i]), + &scale_num($int64_mod[$i], 6, $load[$i]); + } +} + +sub print_float +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'float_add', 'float_mul', 'float_div', + 'float_bogomflops' )) <= 0) { + return; + } + print<<EOF; + +Basic float operations - times in nanoseconds - smaller is better +----------------------------------------------------------------- +Host OS float float float float + add mul div bogo +--------- ------------- ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'float_add', 'float_mul', + 'float_div', + 'float_bogomflops' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s\n", + &scale_num($float_add[$i], 6, $load[$i]), + &scale_num($float_mul[$i], 6, $load[$i]), + &scale_num($float_div[$i], 6, $load[$i]), + &scale_num($float_bogomflops[$i], 6, $load[$i]); + } +} + +sub print_double +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'double_add', 'double_mul', 'double_div', + 'double_bogomflops' )) <= 0) { + return; + } + print<<EOF; + +Basic double operations - times in nanoseconds - smaller is better +------------------------------------------------------------------ +Host OS double double double double + add mul div bogo +--------- ------------- ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'double_add', 'double_mul', + 'double_div', + 'double_bogomflops' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf 
"%6.6s %6.6s %6.6s %6.6s\n", + &scale_num($double_add[$i], 6, $load[$i]), + &scale_num($double_mul[$i], 6, $load[$i]), + &scale_num($double_div[$i], 6, $load[$i]), + &scale_num($double_bogomflops[$i], 6, $load[$i]); + } +} + +sub print_ctx +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_ctx0_2', 'lat_ctx16_2', + 'lat_ctx64_2', 'lat_ctx16_8', + 'lat_ctx64_8', 'lat_ctx16_16', + 'lat_ctx64_16' )) <= 0) { + return; + } + print<<EOF; + +Context switching - times in microseconds - smaller is better +------------------------------------------------------------------------- +Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K + ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw +--------- ------------- ------ ------ ------ ------ ------ ------- ------- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_ctx0_2', 'lat_ctx16_2', + 'lat_ctx64_2', 'lat_ctx16_8', + 'lat_ctx64_8', 'lat_ctx16_16', + 'lat_ctx64_16' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s %6.6s %7.7s %7.7s\n", + &num($lat_ctx0_2[$i], 6), + &num($lat_ctx16_2[$i], 6), + &num($lat_ctx64_2[$i], 6), + &num($lat_ctx16_8[$i], 6), + &num($lat_ctx64_8[$i], 6), + &num($lat_ctx16_16[$i], 7), + &num($lat_ctx64_16[$i], 7); + } +} + +sub print_ipc_local +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_ctx0_2', 'lat_pipe', + 'lat_unix', 'lat_udp_local', + 'lat_rpc_udp_local', 'lat_tcp_local', + 'lat_rpc_tcp_local', + 'lat_tcp_connect_local' )) <= 0) { + return; + } + print<<EOF; + +*Local* Communication latencies in microseconds - smaller is better +--------------------------------------------------------------------- +Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP + ctxsw UNIX UDP TCP conn +--------- ------------- ----- ----- ---- ----- ----- ----- ----- ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_ctx0_2', 'lat_pipe', + 'lat_unix', 'lat_udp_local', + 
'lat_rpc_udp_local', 'lat_tcp_local', + 'lat_rpc_tcp_local', + 'lat_tcp_connect_local' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%5.5s %5.5s %4.4s %5.5s %5.5s %5.5s %5.5s %4.4s\n", + &num($lat_ctx0_2[$i], 5), + &num($lat_pipe[$i], 5), + &num($lat_unix[$i], 4), + &num($lat_udp_local[$i], 5), + &num($lat_rpc_udp_local[$i], 5), + &num($lat_tcp_local[$i], 5), + &num($lat_rpc_tcp_local[$i], 5), + &scale_num($lat_tcp_connect_local[$i], 5, $load[$i]); + } +} + +sub print_ipc_remote +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_udp_remote', + 'lat_rpc_udp_remote', 'lat_tcp_remote', + 'lat_rpc_tcp_remote', + 'lat_tcp_connect_remote' )) <= 0) { + return; + } + print<<EOF; + +*Remote* Communication latencies in microseconds - smaller is better +--------------------------------------------------------------------- +Host OS UDP RPC/ TCP RPC/ TCP + UDP TCP conn +--------- ------------- ----- ----- ----- ----- ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_udp_remote', + 'lat_rpc_udp_remote', 'lat_tcp_remote', + 'lat_rpc_tcp_remote', + 'lat_tcp_connect_remote' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%5.5s %5.5s %5.5s %5.5s %4.4s\n", + &num($lat_udp_remote[$i], 5), + &num($lat_rpc_udp_remote[$i], 5), + &num($lat_tcp_remote[$i], 5), + &num($lat_rpc_tcp_remote[$i], 5), + &scale_num($lat_tcp_connect_remote[$i], 4, $load[$i]); + } +} + +sub print_file_vm +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'fs_create_0k', 'fs_create_10k', + 'fs_delete_0k', 'fs_delete_10k', + 'lat_mappings', 'lat_protfault', + 'lat_pagefault' )) <= 0) { + return; + } + print<<EOF; + +File & VM system latencies in microseconds - smaller is better +------------------------------------------------------------------------------- +Host OS 0K File 10K File Mmap Prot Page 100fd + Create Delete Create Delete Latency Fault Fault selct +--------- 
------------- ------ ------ ------ ------ ------- ----- ------- ----- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'fs_create_0k', 'fs_create_10k', + 'fs_delete_0k', 'fs_delete_10k', + 'lat_mappings', 'lat_protfault', + 'lat_pagefault' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + $c0k = $fs_create_0k[$i] <= 0 ? -1 : $M / $fs_create_0k[$i]; + $c10k = $fs_create_10k[$i] <= 0 ? -1 : $M / $fs_create_10k[$i]; + $d0k = $fs_delete_0k[$i] <= 0 ? -1 : $M / $fs_delete_0k[$i]; + $d10k = $fs_delete_10k[$i] <= 0 ? -1 : $M / $fs_delete_10k[$i]; + printf "%6.6s %6.6s %6.6s %6.6s %7.7s %5.5s %7.7s %5.5s\n", + &scale_num($c0k, 6, $load[$i]), + &scale_num($d0k, 6, $load[$i]), + &scale_num($c10k, 6, $load[$i]), + &scale_num($d10k, 6, $load[$i]), + &num($lat_mappings[$i], 7), + &num($lat_protfault[$i], 5), + &num($lat_pagefault[$i], 7), + &num($lat_fd_select[$i], 5); + } +} + +sub print_bw_ipc_local +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'bw_pipe', 'bw_unix', + 'bw_tcp_local', 'bw_reread', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', + 'bw_mem_rdsum' , 'bw_mem_wr' )) <= 0) { + return; + } + print<<EOF; + +*Local* Communication bandwidths in MB/s - bigger is better +----------------------------------------------------------------------------- +Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem + UNIX reread reread (libc) (hand) read write +--------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- ----- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'bw_pipe', 'bw_unix', + 'bw_tcp_local', 'bw_reread', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', + 'bw_mem_rdsum' , 'bw_mem_wr' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%4.4s %4.4s %4.4s %6.6s %6.6s %6.6s %6.6s %4.4s %5.5s\n", + &num($bw_pipe[$i], 4), + &num($bw_unix[$i], 4), + &num($bw_tcp_local[$i], 4), + &num($bw_reread[$i], 6), + &num($bw_mmap[$i], 6), + 
&num($bw_bcopy_libc[$i], 6), + &num($bw_bcopy_unrolled[$i], 6), + &num($bw_mem_rdsum[$i], 4), + &num($bw_mem_wr[$i], 5); + } +} + +sub print_mem +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_l1', 'lat_l2', 'lat_mem' )) <= 0) { + return; + } + print<<EOF; + +Memory latencies in nanoseconds - smaller is better + (WARNING - may not be correct, check graphs) +------------------------------------------------------------------------------ +Host OS Mhz L1 \$ L2 \$ Main mem Rand mem Guesses +--------- ------------- --- ---- ---- -------- -------- ------- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_l1', 'lat_l2', 'lat_mem' )) <= 0) { + next; + } + printf "%-9.9s %13.13s %4d", + $host[$i], &getos($uname[$i]), $misc_mhz[$i]; + $msg = &check_caches; + if ($lat_l1[$i] < 0) { + printf "%6s %6s %11s %s", + "-", "-", "-", + "Bad mhz?"; + } else { + printf " %6.6s %6.6s %6.6s %11.11s", + &num($lat_l1[$i], 6), + &num($lat_l2[$i], 6), + &num($lat_mem[$i], 6), + &num($lat_mem_rand[$i], 6); + print $msg if ($msg =~ /L/); + } + print "\n"; + } +} + + +# checks to see if there are any valid results +# +sub resultsq +{ + local($low, $high, @pars) = @_; + local($i); + local($val); + + for ($i = $low; $i <= $high; $i++) { + foreach $p (@pars) { + $val = eval '$' . $p . '[' . $i . 
']'; + if ($val > 0) { + return (1); + } + } + } + return (0); +} + +# (33, %3d) +sub inum +{ + local($val, $len) = @_; + local($str) = ""; + local($i); + + if (!defined($val) || !($val =~ /^[ ]*[0-9.]+[ ]*$/)) { + $val = -1; + } + if ($val <= 0) { + $str = ""; + for ($i = 0; $i < $len; $i++) { + $str .= " "; + } + return ($str); + } + + $fmt = sprintf("%%%dd", $len); + $str = sprintf($fmt, $val); + + $str; +} +# (33, %3d, scale) +sub scale_num +{ + local($val, $len, $scale) = @_; + + if ($scale > 1) { + $val = -1 + } + return (&num($val, $len)); +} +# (33, %3d) +sub num +{ + local($val, $len) = @_; + local($str) = ""; + local($i); + + if (!defined($val) || !($val =~ /^[ ]*[0-9.]+[ ]*$/)) { + $val = -1; + } + if ($val <= 0) { + $str = ""; + for ($i = 0; $i < $len; $i++) { + $str .= " "; + } + return ($str); + } + if ($val >= 10 * $M) { + $nstr = sprintf("%.1f", $val / $M); + $fmt = sprintf("%%%d.%ds%%s", $len - 1, $len - 1); + $str = sprintf($fmt, $nstr, "M"); + } elsif ($val >= 10 * $K) { + $nstr = sprintf("%.1f", $val / $K); + $fmt = sprintf("%%%d.%ds%%s", $len - 1, $len - 1); + $str = sprintf($fmt, $nstr, "K"); + } elsif ($val >= 10) { + $nstr = sprintf("%.1f", $val); + $fmt = sprintf("%%%d.%ds", $len, $len); + $str = sprintf($fmt, $nstr); + } elsif ($val < 0.001) { + $fmt = sprintf("%%%d.%de", $len, $len - 6); + $str = sprintf($fmt, $val); + } else { + $fmt = sprintf("%%%d.%df", $len, $len - 2); + $str = sprintf($fmt, $val); + } + $str; +} + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest value before the blank line. 
+sub getbiggest +{ + local($msg) = @_; + local($line) = 0; + + undef $save; + $value = 0; + while (<FD>) { + $line++; + #warn "$line $_"; + last if /^\s*$/; + last if (!($_ =~ /^\d+/)); + $save = $_ if /^\d+\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 4) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} diff --git a/performance/lmbench3/scripts/gifs b/performance/lmbench3/scripts/gifs new file mode 100755 index 0000000..6691b58 --- /dev/null +++ b/performance/lmbench3/scripts/gifs @@ -0,0 +1,33 @@ + +# Make HTML files that will point to the right GIF files. +# Usage: bghtml file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. 
+# $Id: gifs 1.4 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +&pbm; +exit 0; + +sub pbm +{ + @ctx = <HTML/ctx*.pbm>; pop(@ctx); + @mem = <HTML/mem*.pbm>; pop(@mem); + @bar = <HTML/bar*.pbm>; pop(@bar); + + foreach $i (<HTML/*.pbm>) { + ($out = $i) =~ s/.pbm//; + warn "Bitmap munging $out\n"; + #system "pnmcrop < $i | ppmtogif -transparent 1,1,1 > $out"; + system " +pnmcrop < $i > HTML/___tmp 2>/dev/null +set `pnmfile HTML/___tmp` +newx=`expr \$4 - 2` +newy=`expr \$6 - 2` +pnmcut 1 1 \$newx \$newy < HTML/___tmp > HTML/___tmp.pnm +convert -mattecolor slategrey -frame 15x15+0+6 HTML/___tmp.pnm HTML/___tmp.ppm +ppmtogif < HTML/___tmp.ppm > $out.gif 2>/dev/null"; + } +} diff --git a/performance/lmbench3/scripts/gnu-os b/performance/lmbench3/scripts/gnu-os new file mode 100755 index 0000000..f2f8819 --- /dev/null +++ b/performance/lmbench3/scripts/gnu-os @@ -0,0 +1,1439 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + +timestamp='2004-08-18' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Originally written by Per Bothner <per@xxxxxxxxxxx>. +# Please send patches to <config-patches@xxxxxxx>. Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to <config-patches@xxxxxxx>." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit 0 ;; + --version | -v ) + echo "$version" ; exit 0 ;; + --help | --h* | -h ) + echo "$usage"; exit 0 ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. 
+ break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then TMPDIR=$t + break + fi +done + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d -q "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. 
+# (ghazi@xxxxxxxxxxxxxxx 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? 
+ os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" + exit 0 ;; + amiga:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + arc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + hp300:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mac68k:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + macppc:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme68k:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme88k:OpenBSD:*:*) + echo m88k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvmeppc:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + pegasos:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + pmax:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sgi:OpenBSD:*:*) + echo mipseb-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sun3:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + wgrisc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + *:OpenBSD:*:*) + echo ${UNAME_MACHINE}-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + alpha:OSF1:*:*) + if test $UNAME_RELEASE = "V4.0"; then + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + fi + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced 
since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit 0 ;; + Alpha*:OpenVMS:*:*) + echo alpha-hp-vms + exit 0 ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? 
+ echo alpha-pc-interix + exit 0 ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit 0 ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit 0;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit 0 ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit 0 ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit 0 ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit 0 ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit 0;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit 0;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@xxxxxxxxxxxxxxxxxxxx (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit 0 ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit 0 ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit 0 ;; + DRS?6000:UNIX_SV:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7 && exit 0 ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. 
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit 0 ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit 0 ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit 0 ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit 0 ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit 0 ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit 0 ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit 0 ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit 0 ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit 0 ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit 
0 ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include <stdio.h> /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c \ + && $dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \ + && exit 0 + echo mips-mips-riscos${UNAME_RELEASE} + exit 0 ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit 0 ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit 0 ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit 0 ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit 0 ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit 0 ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit 0 ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit 0 ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit 0 ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit 0 ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit 0 ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + 
exit 0 ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit 0 ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit 0 ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit 0 ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit 0 ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <sys/systemcfg.h> + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0 + echo rs6000-ibm-aix3.2.5 + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit 0 ;; + *:AIX:*:[45]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit 0 ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit 0 ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit 0 ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit 0 ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit 0 ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit 0 ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit 0 ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo 
m68k-hp-bsd4.4 + exit 0 ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include <stdlib.h> + #include <unistd.h> + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + # avoid double evaluation of $set_cc_for_build + test -n "$CC_FOR_BUILD" || eval $set_cc_for_build + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E -) | grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit 0 ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo 
ia64-hp-hpux${HPUX_REV} + exit 0 ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <unistd.h> + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0 + echo unknown-hitachi-hiuxwe2 + exit 0 ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit 0 ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit 0 ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit 0 ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit 0 ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit 0 ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit 0 ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit 0 ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit 0 ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit 0 ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit 0 ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit 0 ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + 
CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + *:UNICOS/mp:*:*) + echo nv1-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit 0 ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit 0 ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit 0 ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit 0 ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit 0 ;; + *:FreeBSD:*:*) + # Determine whether the default compiler uses glibc. + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <features.h> + #if __GLIBC__ >= 2 + LIBC=gnu + #else + LIBC= + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` + # GNU/KFreeBSD systems have a "k" prefix to indicate we are using + # FreeBSD's kernel, but not the complete OS. 
+ case ${LIBC} in gnu) kernel_only='k' ;; esac + echo ${UNAME_MACHINE}-unknown-${kernel_only}freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`${LIBC:+-$LIBC} + exit 0 ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit 0 ;; + i*:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit 0 ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit 0 ;; + x86:Interix*:[34]*) + echo i586-pc-interix${UNAME_RELEASE}|sed -e 's/\..*//' + exit 0 ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit 0 ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? + echo i586-pc-interix + exit 0 ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit 0 ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit 0 ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit 0 ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit 0 ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit 0 ;; + arm*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit 0 ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if 
defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` + test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0 + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` + test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0 + ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit 0 ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit 0 ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit 0 ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit 0 ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit 0 ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit 0 ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit 0 ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it will tell us. cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit 0 ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit 0 ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. 
+ echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit 0 ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <features.h> + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #ifdef __INTEL_COMPILER + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` + test x"${LIBC}" != x && echo "${UNAME_MACHINE}-pc-linux-${LIBC}" && exit 0 + test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0 + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit 0 ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit 0 ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. 
+ echo ${UNAME_MACHINE}-pc-os2-emx + exit 0 ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit 0 ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit 0 ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit 0 ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit 0 ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit 0 ;; + i*86:*:5:[78]*) + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit 0 ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` + echo ${UNAME_MACHINE}-pc-isc$UNAME_REL + elif /bin/uname -X 2>/dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit 0 ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. 
+ echo i386-pc-msdosdjgpp + exit 0 ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit 0 ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit 0 ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit 0 ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit 0 ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit 0 ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit 0 ;; + M68*:*:R3V[567]*:*) + test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4.3${OS_REL} && exit 0 + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4 && exit 0 ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit 0 ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit 0 ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit 0 ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit 0 ;; + 
*:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit 0 ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@xxxxxxxxxxxxxxxxx> + echo i586-unisys-sysv4 + exit 0 ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@xxxxxxxxxxxxxx>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit 0 ;; + *:*:*:FTX*) + # From seanf@xxxxxxxxxxxxxxxx. + echo i860-stratus-sysv4 + exit 0 ;; + *:VOS:*:*) + # From Paul.Green@xxxxxxxxxxx. + echo hppa1.1-stratus-vos + exit 0 ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit 0 ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit 0 ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit 0 ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit 0 ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit 0 ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
+ echo i586-pc-beos + exit 0 ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit 0 ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit 0 ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit 0 ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit 0 ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit 0 ;; + *:Darwin:*:*) + case `uname -p` in + *86) UNAME_PROCESSOR=i686 ;; + powerpc) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit 0 ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit 0 ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit 0 ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit 0 ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit 0 ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit 0 ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit 0 ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit 0 ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit 0 ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit 0 ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit 0 ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit 0 ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit 0 ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit 0 ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit 0 ;; + *:DRAGONFLY:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly${UNAME_RELEASE} + exit 0 ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c <<EOF +#ifdef _SEQUENT_ +# include <sys/types.h> +# include <sys/utsname.h> +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include <sys/param.h> + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include <sys/param.h> +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && $dummy && exit 0 + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit 0 ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + c34*) + echo c34-convex-bsd + exit 0 ;; + c38*) + echo c38-convex-bsd + exit 0 ;; + c4*) + echo c4-convex-bsd + exit 0 ;; + esac +fi + +cat >&2 <<EOF +$0: unable to guess system type + +This script, last modified $timestamp, has failed to recognize +the operating system you are using. It is advised that you +download the most up to date version of the config scripts from + + ftp://ftp.gnu.org/pub/gnu/config/ + +If the version you run ($0) is already up to date, please +send the following data and any information you think might be +pertinent to <config-patches@xxxxxxx> in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/performance/lmbench3/scripts/graph b/performance/lmbench3/scripts/graph new file mode 100755 index 0000000..63cbefc --- /dev/null +++ b/performance/lmbench3/scripts/graph @@ -0,0 +1,947 @@ + +# $Id: graph 1.12 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval "exec perl -Ss $0 $@" + if 0; + +# A graphing preprocessor for GNU pic / troff package. +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# Input format is like that of Xgraph, i.e., sets of X Y pairs, +# divided up by blank lines and titled with a "title. Like so +# +# 1 1 +# 2 2 +# "straight slope +# +# 4 4 +# 1 4 +# "straight down +# +# Optional "quartile" data input format. +# The drawing is ----- o ---, with the lines being from y1..y2, y4..y5, +# and the mark at y3. 
+# +# x y1 y2 y3 y4 y5 +# x y1 y2 y3 y4 y5 +# x y1 y2 y3 y4 y5 +# +# Optional input (superset of Xgraph) is like so: +# +# %T Graph title in +4 point font +# %X X axis title and/or units in +2 point font +# %Y Y axis title and/or units in +2 point font +# %P Page title in +4 point font +# %fakemax-X <value> force graph to be that big +# %fakemax-Y <value> force graph to be that big +# %fakemin-X <value> force graph to be that big +# %fakemin-Y <value> force graph to be that big +# +# Options: +# -lm implies -big -below -grid -close +# -rev reverse X/Y data sense (and titles) +# -below put data set titles below the graph rather than to the right +# -close no extra space around the data +# -qline connect the quartile center points +# -grid grid :-) +# -halfgrid Grid lines where the major ticks are +# -nobox no box around whole graph +# -big make the graph take the whole page +# -slide make the graph fit in my slides +# -small make the graph be small so you can do a lot of them. +# -notitle no Title label +# -nolabels no X/Y/Title labels +# -nodatal no dataset labels +# -nomarks no marks on the graphs. +# -nolines no lines connecting the marks (don't use w/ -nomarks :-) +# -k print (absolute) values larger than 1000 as (value/1000)K +# -grapheach graph each data set separately +# -br_title start a new graph at each title. +# -nospace no .sp at top of picture +# -ts time series, X axis is implied. +# -hist produce a histogram graph +# +# Hacks :-) +# -xk multiply X input by 1024. +# -xm multiply X input by 1024*1024. +# -logx take the log base 2 of X input +# -logy take the log base 2 of Y input +# -cut add cut marks so that image croppers dont crop too close +# +# Much thanks to James Clark for providing such a nice replacement for +# the Unix troff package. Thanks to the Xgraph folks for providing +# inspiration. Thanks to Declan Murphy for math :-) +# Thanks to noone for floating point numbers, they suck dog doo. 
+# There are lots of hacks in here to deal with rounding errors. +# +# TODO: +# All of the option parsing done manually. +# A filter option to print ranges of the data? +# A way to do each data set in it's own graph. +# All of the other xgraph options? +# For Adam, that butthead, an option to sort the labels such that they +# are in the same order as the right endpoints of the data sets. + +&init; +&autosize; +&pic; +exit; + +# init - slurp in the data and apply any transformations. +sub init +{ + # Lint for the options. + $qline = $ts = $close = $nolines = $thk1 = $thk2 = $k = $notitle + = $thk1_5 = $xm = $grid = $nospace = $lm = $hist = 0 if 0; + + if ($grapheach) { $grapheach = 1; $cut = 0; } else { $grapheach = 0; } + if ($halfgrid) { $halfgrid = 1; } else { $halfgrid = 0; } + if ($hist) { $nobox = 1; $nolabels = 1; $close = 1; $nolines = 1; } + if ($lm) { $big = $below = $grid = $close = 1; } + + # Accept %options=value on the command line. + while ($ARGV[0] =~ /^%/) { + $_ = $ARGV[0]; + s/=/ /; + push(@lines, "$_\n"); + shift(@ARGV); + } + + # OK, sometimes we get + # %T title + # %X X axis, etc. + # + # "data set 1 + # + # And this messes up the numbering later on. So we carefully dump the + # whitespace between the control and data. 
+ while (<>) { + last if /^\s*$/; + push(@lines, $_); + last if /^"/; + last if /^\d/; + } + push(@lines, <>); + $fake = ""; + $items = 0; + $stat_sum = 0; + $min = 1.7E+308; + $max = 2.2E-308; + foreach (@lines) { + if (/^"?%fake/) { + $fake = $_; + s/"?%fakemax-//; + s/"?%fakemin-//; + @_ = split; + $_ = "$_[1] $_[1]"; + } elsif (/^%hist\s/) { + split; + shift(@_); + ($hist_bsize, $hist_low, $hist_high) = @_; + next; + } else { + next if /^\s*["%#]/; + next if /^\s*$/; + } + if ($ts) { + $_ = "$items $_"; + } + $items++; + @_ = split; + if ($xk) { + $_[0] = $_[0] * 1024; + } elsif ($xm) { + $_[0] = $_[0] * 1024 * 1024; + } + if ($logx) { + $_[0] = &logbase(2, $_[0]); + } + if ($yk) { + $_[1] = $_[1] * 1024; + } elsif ($ym) { + $_[1] = $_[1] * 1024 * 1024; + } + if ($logy) { + $_[1] = &logbase(2, $_[1]); + } + if ($rev) { + $_ = "$_[1] $_[0]"; + $y = $_[0]; + } else { + $_ = "$_[0] $_[1]"; + $y = $_[1]; + } + $stat_sum += $y; + $max = $y if ($y > $max); + $min = $y if ($y < $min); + push(@y, $y); + if ($fake =~ /[XY]/) { + # XXX - reverse? What should it do? + if ($fake =~ /fakemax-X/) { + $fakemax_X = $_[0]; + } elsif ($fake =~ /fakemax-Y/) { + $fakemax_Y = $_[1]; + } elsif ($fake =~ /fakemin-X/) { + $fakemin_X = $_[0]; + } elsif ($fake =~ /fakemin-Y/) { + $fakemin_Y = $_[1]; + } + $_ = $fake; + $fake = ""; + } + } + + # Do some statistics. + @s = sort(@y); + if ($items & 1) { + $stat_median = $s[($items + 1)/2]; + } else { + $i = $items / 2; + $stat_median = ($s[$i] + $s[$i+1]) / 2; + } + $stat_avg = $stat_sum/$items; + $stat_avgdev = $stat_var = 0; + # $stat_skew = $stat_curt = 0; + foreach $_ (@lines) { + next if /^\s*["#%]/; + next if /^\s*$/; + @_ = split; + $stat_var += ($_[1] - $stat_median) ** 2; + $tmp = $_[1] - $stat_median; + $stat_avgdev += $tmp > 0 ? 
$tmp : -$tmp; + } + $stat_var /= $items - 1; + $stat_stddev = sqrt($stat_var); + $stat_avgdev /= $items; + if ($ts) { + printf STDERR "N=$items min=$min max=$max med=%.2f avg=%.2f stddev=%.2f avgdev=%.2f\n", + $stat_median, $stat_avg, $stat_stddev, $stat_avgdev; + } + + # Diddle this to create different marks. + @marks = ( + '[ "\s+2\(bu\s0" ]', + '[ "\(sq" ]', + '[ "\(*D" ]', + '[ "\s+2\(pl\s0" ]', + '[ "\(*F" ]', + '[ "\s+2\fB\(mu\fP\s0" ]', + '[ circle rad .035 fill 0 ]', + '[ box ht .07 wid .07 fill 1 ]', + '[ "\(dd" ]', + ); + $nmarks = $#marks + 1; + $nomark = '[ box invis ht .05 wid .05 ]'; + + $first_title = 1; + + if ($nospace) { + $graphspace = "0"; + } elsif ($small) { + $graphspace = ".15i"; + } elsif ($medium) { + $graphspace = ".20i"; + } else { + $graphspace = ".25i"; + } + + if ($small) { + $marks[0] = '[ circle rad .007 fill 1 ]'; + $PS = 10; + $ft = "B"; + $tick = .1; + } elsif ($medium) { + $PS = 11; + $ft = "HB"; + $tick = .1; + } elsif ($slide) { + $ft = "HB"; + $PS = 11; + $tick = .15; + } else { + $ft = "CB"; + $PS = 12; + $tick = .15; + } + $thk = .75; + $thk = 1 if $thk1; + $thk = 1.5 if $thk1_5; + $thk = 2 if $thk2; + $thk = .2 if $thk_2; + $gthk = .25; + $gthk = 1 if $gthk1; + $gthk = .75 if $gthk_75; + $gthk = .5 if $gthk_5; + $lineinvis = $nolines ? "invis" : ""; +} + +# Calculate min/max to autosize the graph. 
+sub autosize +{ + foreach $_ (@lines) { + next if /^\s*["#%]/; + next if /^\s*$/; + @_ = split; + if ($#_ == 1) { + $Ymax = $Ymin = $_[1]; + } elsif ($#_ == 5) { # Quartile plot + $Ymax = $Ymin = $_[1]; + for ($i = 2; $i <= 5; ++$i) { + $Ymax = $_[$i] if ($Ymax < $_[$i]); + $Ymin = $_[$i] if ($Ymin > $_[$i]); + } + } else { + die "Data format error: $_\n"; + } + if (!defined $xmin) { + $xmin = $_[0]; + $xmax = $_[0]; + $ymin = $Ymin; + $ymax = $Ymax; + } + else { + $xmin = $_[0] if ($xmin > $_[0]); + $xmax = $_[0] if ($xmax < $_[0]); + $ymin = $Ymin if ($ymin > $Ymin); + $ymax = $Ymax if ($ymax < $Ymax); + } + } + + # Handle fake max + if (defined($fakemax_X) && $fakemax_X > $xmax) { + $xmax = $fakemax_X; + } + if (defined($fakemax_Y) && $fakemax_Y > $ymax) { + $ymax = $fakemax_Y; + } + if (defined($fakemin_X) && $fakemin_X < $xmin) { + $xmin = $fakemin_X; + } + if (defined($fakemin_Y) && $fakemin_Y < $ymin) { + $ymin = $fakemin_Y; + } + if ($hist) { + $xmax += $hist_bsize; + } + warn "n=$items xmin=$xmin xmax=$xmax ymin=$ymin ymax=$ymax\n" if $debug; + ($xlower, $xupper, $xtick) = &tick($xmin, $xmax, $logx ? 2 : 10); + ($ylower, $yupper, $ytick) = &tick($ymin, $ymax, $logy ? 
2 : 10); + if ($ymax + $ytick*.45 < $yupper) { + $yupper -= $ytick; + $ypartial = $ymax - $yupper; + } else { + $ypartial = 0; + } + $xn = int(.9 + ($xupper - $xlower) / $xtick); + $yn = int(.9 + ($yupper - $ylower) / $ytick); + $xlower = sprintf("%.6f", $xlower); # really ugly cast + $xupper = sprintf("%.6f", $xupper); # really ugly cast + $xtick = sprintf("%.6f", $xtick); # really ugly cast + $xn = sprintf("%.0f", $xn); # really ugly cast + $ylower = sprintf("%.6f", $ylower); # really ugly cast + $yupper = sprintf("%.6f", $yupper); # really ugly cast + $ytick = sprintf("%.6f", $ytick); # really ugly cast + $yn = sprintf("%.0f", $yn); # really ugly cast +} + +# Since I had to go rethink it, here's the explanation: +# +# log base e 10 = X implies e**x = 10 +# e ** (v * x) = (e ** x) ** v +# since e ** x == 10, that implies e ** (v * x) is 10 ** v +# Capeesh? +sub expbase +{ + local($base, $val) = @_; + + exp($val * log($base)); +} + +sub logbase +{ + local($base, $val) = @_; + + if ($val == 0) { + return 0; + } + if ($val < 0) { + die "Input: $_: can't take log of negative value: $val\n"; + } + log($val) / log($base); +} + +# Figure out the tick marks. +# XXX - the log stuff is not quite right. +sub tick +{ + local($min, $max, $base) = @_; + local($delta, $adj, $lower, $upper, $tick); + + $delta = $max - $min; + $tick = int(&logbase(10, $delta)); + $tick = &expbase(10, $tick - 1); + if ($delta / $tick > 10) { + if ($base == 10) { + if (($delta / (2 * $tick)) > 15) { + $adj = 10; + } elsif (($delta / (2 * $tick)) > 10) { + $adj = 5; + } else { + $adj = 2; + } + } else { + $adj = 2; + } + } else { + $adj = 1; + } + $tick *= $adj; + + # Go figure out the endpoints. This is O(log10(n)) where N is the + # number of ticks from 0 to the min. 
+ $lower = 0; + for ($i = 10e99; $i > 0; $i = int($i/$base)) { + $fudge = $i * $tick; + $bound = $min + $fudge * .00001; + + # Sometimes it's too big + while ($lower > $bound) { + $lower -= $fudge; + } + + # Sometimes it's too small + while (($lower + $fudge) <= $bound) { + $lower += $fudge; + } + } + + if ($base == 2) { + if ($tick < 1) { + $tick = 1; + } else { + $tick = sprintf("%.0f", $tick); + } + $lower = sprintf("%.0f", $lower); + } + for ($upper = $lower; $upper < $max - $tick * .00001; $upper += $tick) { + } + if ($base == 2) { + $upper = sprintf("%.0f", $upper); + } + # If you don't like your end points on the border then do this. + unless ($close) { + if ($min - $lower < .1 * $tick) { + $lower -= $tick; + } + if ($max - $upper < .1 * $tick) { + $upper += $tick; + } + } + ($lower, $upper, $tick); +} + +# Spit out the pic stuff. +# The idea here is to spit the variables and let pic do most of the math. +# This allows tweaking of the output by hand. +sub pic +{ + if ($k) { + $print = 'sprintf("%.0fK", j/1000)'; + } else { + $print = 'sprintf("%.0f", j)'; + } + if ($grid || $halfgrid) { + $nogrid = "dotted"; + } else { + $nogrid = "invis"; + } + if ($nobox) { + $nobox = "invis"; + } + $log_x = $logx ? "logx = 1" : "logx = 0"; + $log_y = $logy ? 
"logy = 1" : "logy = 0"; + if ($big) { + print ".sp .5i\n.po .5i\n"; + if ($below) { + $ysize = 7; + } else { + $ysize = 9; + } + if ($nodatal) { + $xsize = 7; + } else { + $xsize = 6; + } + } elsif ($small) { + $ysize = 1.75; + $xsize = 1.75; + } elsif ($medium) { + print ".po .52i\n"; + $ysize = 1.9; + $xsize = 2.05; + } elsif ($slide) { + print ".sp .35i\n"; + $xsize = 4.5; + $ysize = 4.1; + } else { + print ".sp 1i\n"; + $ysize = 5; + $xsize = 5; + } + &graph; + + # Mark the data points + @datasets = (); + for ($sub = 0; $sub <= $#lines; $sub++) { + $_ = $lines[$sub]; + if (/^\s*$/) { # end of data set + &data($set++); + if ($grapheach) { + &titles; + if ($small) { + if ($set == 4) { + print ".sp -11i\n"; + print ".po 3.5i\n"; + } elsif ($set == 8) { + print ".sp -11i\n"; + print ".po 6i\n"; + } + } else { # ??? + if ($set == 4) { + print ".sp -11i\n"; + print ".po 3.15i\n"; + } elsif ($set == 8) { + print ".sp -11i\n"; + print ".po 5.8i\n"; + } + } + + if ($sub < $#lines) { + &graph; + } + } + next; + } + if (/^"?%fake/) { # Skip this + next; + } + if (/^"?%T\s+/) { # Title specification + # Spit out the last graph at next title. 
+ if ($br_title && $graphs++ > 0) { + &titles; + if ($graphs == 5) { + print ".sp -11i\n"; + print ".po 3.5i\n"; + } elsif ($graphs == 9) { + print ".sp -11i\n"; + print ".po 6i\n"; + } + &graph; + } + s/^"?%T\s+//; + chop; + $Gtitle = $_; + next; + } + if (/^"?%X\s+/) { # X axis title specification + s/^"?%X\s+//; + chop; + $Xtitle = $_; + next; + } + if (/^"?%Y\s+/) { # Y axis title specification + s/^"?%Y\s+//; + chop; + $Ytitle = $_; + next; + } + if (/^"?%P\s+/) { # Page title specification + s/^"?%P\s+//; + chop; + $Ptitle = $_; + warn "Pt: $Ptitle\n"; + next; + } + if (/^"/) { # Data set title + s/^"//; + chop; + $dataset = $_; + push(@datasets, "$dataset"); + next; + } + push(@data, $_); + } + unless ($grapheach) { + &data($set++); + &titles; + } + if (defined($Ptitle)) { + print ".po 1i\n.sp -12i\n.ps 20\n.ce 1\n"; + print "$Ptitle\n"; + print ".po 1i\n.sp -12i\n.sp 10.4i\n.ps 20\n.ce 1\n"; + print "$Ptitle\n"; + } +} + +# Draw the titles and finish this graph. +sub titles +{ + # Do X/Y titles, if any. + unless ($nolabels) { + $Xtitle = defined($Xtitle) ? $Xtitle : "X"; + $Ytitle = defined($Ytitle) ? $Ytitle : "Y"; + if ($rev && $first_title) { + $tmp = $Xtitle; + $Xtitle = $Ytitle; + $Ytitle = $tmp; + } + print "\n# Xaxis title.\n"; + print "\"\\s+4$Xtitle\\s0\" rjust at O.se - (0, .6)\n"; + + print "\n# Yaxis title ($Ytitle)\n.ps +2\n"; + $tmp = $Ytitle; + while (length($tmp) > 0) { + $tmp =~ s/(.)//; + print "\"$1\" "; + } + print "\\\n at O.w - (.75, 0)\n.ps\n"; + + } + + # Do the graph title, if any. + $Gtitle = defined($Gtitle) ? 
$Gtitle : "Pic Graph"; + if ($grapheach) { + $Gtitle = $datasets[$#datasets]; + print "\n# Graph title.\n"; + print "\"$Gtitle\" at O.n + (0, .1)\n"; + } + + if ($br_title) { + print "\n# Graph title.\n"; + print "\"\\s+2$Gtitle\\s0\" at O.n + (0, .1)\n"; + } + + unless ($nolabels || $notitle) { + print "\n# Graph title.\n"; + if ($big) { + print "\"\\s+8$Gtitle\\s0\" at O.n + (0, .3)\n"; + } else { + print "\"\\s+4$Gtitle\\s0\" at O.n + (0, .3)\n"; + } + } + + if ($cut) { + $cutthick = .75; + print "\n# Cut marks\n"; + print "move to O.n + 0,.65; line thick $cutthick right .1\n"; + print "move to O.w - 1,0; line thick $cutthick down .1\n"; + print "move to O.e + .35,0; line thick $cutthick down .1\n"; + } + + # Do the dataset titles. + $i = 0; + unless ($nodatal) { + print "\n# Title.\n"; + if (!$grapheach) { + print ".ft R\n" if ($slide); + for ( ; $i <= $#datasets; $i++) { + print $marks[$i % $nmarks]; + if ($below) { + print " at O.sw - (0, .75 + $i * vs)\n"; + } else { + print " at O.ne + (.25, - $i * vs)\n"; + } + print + "\"$datasets[$i]\" ljust at last [].e + (.1, 0)\n"; + } + if ($cut) { + print "\nmove to O.s - 0,.75 + $i * vs\n"; + print "line thick $cutthick right .1\n"; + } + print ".ft\n" if ($slide); + } + } + + # Finish up. + print "]\n.ft\n.ps\n.PE\n"; + + # Do the statistics + if ($stats) { + $i++; + $min = sprintf "%.4f", $min; + $max = sprintf "%.4f", $max; + $stat_median = sprintf "%.4f", $stat_median; + $stat_avg = sprintf "%.4f", $stat_avg; + $stat_stddev = sprintf "%.4f", $stat_stddev; + $stat_avgdev = sprintf "%.4f", $stat_avgdev; + print <<EOF; +.ps 12 +.vs 14 +.ft CB +.po +.7i +.TS +c s +l r. +Statistics += +min $min +max $max +median $stat_median +average $stat_avg +stddev $stat_stddev +avgdev $stat_avgdev +.TE +.po -.7i +.ft +.ps +.vs +EOF + } + + $first_title = 0; +} + +sub graph +{ + if ($hist) { $hist = 1; } else { $hist = 0; } + print ".sp ${graphspace}\n"; + print <<EOF; +.PS +.ps $PS +.vs 11 +.ft $ft +[ +# Variables, tweak these. 
+ xtick = $xtick # width of an X tick + xlower = $xlower # where the xtick start + xupper = $xupper # upper range of graph + xn = $xn # number of ticks to do + ytick = $ytick # width of an Y tick + ylower = $ylower # where the ytick start + yupper = $yupper # upper range of graph + yn = $yn # number of ticks to do + xsize = $xsize # width of the graph + ysize = $ysize # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = $tick # distance towards numbers + gthk = $gthk # thickness of grid lines + thk = $thk # thickness of data lines + grapheach = $grapheach # doing lotso little ones? + halfgrid = $halfgrid # fewer grid lines + qthk = 2.0 # thickness of quartile lines + vs = .15 # works for 10 point fonts + hist = $hist # histogram + ypartial = $ypartial # Y spillerover + $log_x # 1 if x data is log base 2 + $log_y # 1 if y data is log base 2 + +# Draw the graph borders and tick marks + O: box $nobox thick 2 ht ysize wid xsize + if (hist) then { + # The box was invisible, draw the three sides + # The partial part i sbecause we are just too big. 
+ line thick 2 from O.sw to O.se + line thick 2 from O.sw to O.nw + 0,ypartial*yscale + line thick 2 from O.se to O.ne + 0,ypartial*yscale + xgridlen = xsize + tick/2 + } else { + xgridlen = xsize + } + if (ysize < 2.5) then { + ysp = -.15 + xsp = -.2 + tick = tick * .75 + } else { + ysp = -.2 + xsp = -.25 + } + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + # Draw the ticks to the numbers on the Y axis + line thick gthk from O.sw + (-tick, g) to O.sw + (0, g) + if (hist) then { + line thick gthk from O.se + (tick, g) to O.se + (0, g) + } + # Grid line across at same level as number ticks + line $nogrid thick gthk from O.sw + 0,g to O.sw + xsize,g + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + if (!halfgrid) then { + # Grid line across between number ticks + line $nogrid thick gthk from \\ + O.sw + (-t, y2) to O.sw + (xgridlen, y2) + } + } + if (logy == 1) then { + tmp = 2 ^ j; + if (tmp >= 1024*1024) then { + tmp = tmp / (1024*1024) + sprintf("%.0fM", tmp) at O.sw + ysp,g-.02 + } else { if (tmp >= 1024) then { + tmp = tmp / 1024 + sprintf("%.0fK", tmp) rjust at O.sw + ysp,g-.02 + } else { + sprintf("%.0f", tmp) rjust at O.sw + ysp,g-.02 + }} + } else { if (yupper - ylower > 999) then { + $print rjust at O.sw + ysp, g - .02 + if (hist) then { $print ljust at O.se + -ysp,g-.02 } + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.0f", j) ljust at O.se + -ysp,g-.02 + } + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + ysp, g - .02 + sprintf("%.1f", j) rjust at O.sw + ysp, g - .02 + } else { if (yupper - ylower > .1) then { + sprintf("%.2f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.2f", j) ljust at O.se + -ysp,g-.02 + } + } else { + sprintf("%.3f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.3f", j) ljust at O.se + -ysp,g-.02 + } + }}}}} + j = j + ytick + } + j = 
xlower + even = 0 + for i = 0 to xn by 1 do { + even = !even + doit = !grapheach || xn > 9 || even + xs = j - xlower + g = xs * xscale + line thick gthk from O.sw + (g, -tick) to O.sw + (g, 0) + if (!hist) then { + line $nogrid thick gthk from O.sw + g,0 to O.sw + g,ysize + } + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + if (!halfgrid && !hist) then { + line $nogrid thick gthk from O.sw+x2,-t to O.sw+x2,ysize + } + } + if (logx == 1) then { + tmp = 2 ^ j; + if (tmp >= 1024*1024) then { + tmp = tmp / (1024*1024) + if (doit) then { + sprintf("%.0fM", tmp) at O.sw + g,xsp + } + } else { if (tmp >= 1024) then { + tmp = tmp / 1024 + if (doit) then { + sprintf("%.0fK", tmp) at O.sw + g,xsp + } + } else { + if (doit) then { + sprintf("%.0f", tmp) at O.sw + g,xsp + } + }} + } else { if (xupper - xlower > 999) then { + $print at O.sw + g, xsp + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + g, xsp + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + g, xsp + } else { if (xupper - xlower > .1) then { + sprintf("%.2f", j) at O.sw + g, xsp + } else { + sprintf("%.3f", j) at O.sw + g, xsp + }}}}} + j = j + xtick + } +EOF + # Add some statistics. + if ($stats) { + print "line from O.sw + 0,(yscale * ($stat_avg - $ylower)) " . + "to O.se + 0,(yscale * ($stat_avg - $ylower))\n"; + print "\"average\" at last line.e + .2,0 ljust\n"; + print "line from O.sw + 0,(yscale * ($stat_median - $ylower)) " . + "to O.se + 0,(yscale * ($stat_median - $ylower))\n"; + print "\"median\" at last line.e + .2,0 ljust\n"; + $tmp = $stat_median + $stat_avgdev; + print "line from O.sw + 0,(yscale * ($tmp - $ylower)) " . + "to O.se + 0,(yscale * ($tmp - $ylower))\n"; + print "\"+ avgdev\" at last line.e + .2,0 ljust\n"; + $tmp = $stat_median - $stat_avgdev; + print "line from O.sw + 0,(yscale * ($tmp - $ylower)) " . 
+ "to O.se + 0,(yscale * ($tmp - $ylower))\n"; + print "\"- avgdev\" at last line.e + .2,0 ljust\n"; + } +} + +sub data +{ + local($mark) = int(int($_[0]) % int($nmarks)); + + print "\n# DATASET: $dataset, MARK $mark\n"; + $first = 1; + foreach $d (@data) { + next if $d =~ /^\s*"/; + next if $d =~ /^\s*#/; + next if $d =~ /^\s*$/; + @_ = split(/[ \t\n]+/, $d); + $x = sprintf("%.6g", $_[0]); + $y = sprintf("%.6g", $_[1]); + if ($#_ == 1) { + if ($hist) { + print "box fill .25 " . + "ht yscale * ($y - ylower) " . + "wid $hist_bsize * xscale " . + "with .sw at O.sw + " . + "xscale * ($x - xlower),0\n"; + } elsif ($nomarks && ($grapheach || !$first)) { + print $nomark . " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . + "yscale * ($y - ylower))\n"; + } else { + print $marks[$mark] . + " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . + "yscale * ($y - ylower))\n"; + } + if (!$hist && $first != 1) { + print "line $lineinvis thick thk from " . + "2nd last [].c to last [].c\n"; + } + $first = 0; + } elsif ($#_ == 5) { # Quartile graph + # Draw the lower line + print "x = xscale * ($_[0] - xlower)\n"; + print " line thick qthk from \\\n\t" . + "O.sw + x, yscale * ($_[1] - ylower) to\\\n\t" . + "O.sw + x, yscale * ($_[2] - ylower)\n"; + # Draw the mark + print " $marks[$mark]" . " at O.sw + \\\n\t" . + "x, yscale * ($_[3] - ylower)\n"; + # Draw the upper line + print " line thick qthk from \\\n\t" . + "O.sw + x, yscale * ($_[4] - ylower) to\\\n\t" . + "O.sw + x, yscale * ($_[5] - ylower)\n"; + # Connect the lines? + if ($qline) { + if ($first != 1) { + print "line thick thk from " . + "2nd last [].c to last [].c\n"; + } + } + $first = 0; + } + } + # Put a mark on the end point + if ($nomarks && !$nodatal && !$first && !$grapheach) { + print $marks[$mark] . + " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . 
+ "yscale * ($y - ylower))\n"; + } + @data = (); +} diff --git a/performance/lmbench3/scripts/html-list b/performance/lmbench3/scripts/html-list new file mode 100755 index 0000000..b91572d --- /dev/null +++ b/performance/lmbench3/scripts/html-list @@ -0,0 +1,123 @@ + +# Take the list of files and turn them into an html file that points +# at their context & mem latency GIFs. +# +# Usage: html-list file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. +# $Id: html-list 1.3 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +open(H, ">HTML/specific.html"); +print H <<EOF; +<title>LMBENCH System Results</title> +<h1>LMBENCH System Results</h1> +<h2><a href=summary>Summary of results</a></h2> +<hr> +EOF + +# The order that is passed in is the order of the generated +# graphs so save that. +$val = 0; +foreach $file (@ARGV) { + $number{$file} = ++$val; +} + +# Now sort them so we can group by OS +@ARGV = sort(@ARGV); + +# Figure out the different OS +foreach $file (@ARGV) { + ($os = $file) =~ s|/.*||; + push(@os, $os); + $done{$os} = 0; +} + +foreach $os (@os) { + next if $done{$os}; + $done{$os} = 1; + # Print out an OS specific heading + print H "<hr><h2>Results from $os</h2><p>\n"; + + for ($i = 0; $i <= $#os; $i++) { + $file = $ARGV[$i]; + next unless $file =~ /$os/; + open(F, $file); + $_ = <F>; + close(F); + next unless /lmbench1.[01]/; + chop; + $title = $_; + #s/.lmbench1.? results for //; + ($sys = $file) =~ s|.*/||; + if ($i > 0) { + ($prev_sys = $ARGV[$i - 1]) =~ s|.*/||; + } + if ($i < $#os) { + ($next_sys = $ARGV[$i + 1]) =~ s|.*/||; + } + print H <<EOF; +<h3>Dataset: $sys</h3> +<h4>$title</h4> +<a href="${sys}-ctx.html">Context switch details</a>, +<a href="${sys}-bwmem.html">memory bandwidths</a>, +<a href="${sys}-bwfile.html">file reread vs. memory bandwidths</a>, +and +<a href="${sys}-mem.html">memory latencies</a>. 
+EOF + + # Create the files referencing the data GIFs + $N = sprintf("%02d", $number{$file}); + $prev = $next = ""; + %label = ('ctx', 'context switching', + 'mem', 'memory latency', + 'bwmem', 'memory bandwidth', + 'bwfile', 'file reread bandwidth'); + %doc = ('ctx', 'lat_ctx.8.html', + 'mem', 'lat_mem_rd.8.html', + 'bwmem', 'bw_mem.8.html', + 'bwfile', 'bw_file_rd.8.html'); + $back = "<img align=middle src=\"../gifs/arrows/back.gif\">"; + $forward = "<img align=middle src=\"../gifs/arrows/forward.gif\">"; + for $what ('ctx', 'mem', 'bwmem', 'bwfile') { + for $scale ('', '-unscaled') { + open(S, ">HTML/${sys}-${what}${scale}.html"); + if ($scale eq '') { + $notscale = "-unscaled"; + $lab = ""; + $Lab = "Unscaled "; + } else { + $notscale = ""; + $lab = "scaled "; + $Lab = "Scaled "; + } + $prev = + "<a href=${prev_sys}-${what}${scale}.html> + Previous ${lab}$label{$what} result</a><p>" + if $i > 0; + $next = + "<a href=${next_sys}-${what}.html> + Next ${lab}$label{$what} result</a><p>" + if $i < $#os; + print S<<EOF; +<h4>$title</h4> +<a href=../$doc{$what}>Information on this benchmark</a> (Not up to date) +<p><IMG SRC="${what}${scale}$N.gif">\n<p> +<a href=../lmbench.html> +<img align=middle src="../gifs/arrows/b_arrow.gif">LMBENCH table of contents</a> +<a href=specific.html> +<img align=middle src=\"../gifs/graph.gif\">System results table of contents</a> +<p> +$next +$prev +<a href=${sys}-${what}${notscale}.html> +${Lab}$label{$what} results for this system</a> +EOF + } + } + + } +} +exit 0; diff --git a/performance/lmbench3/scripts/html-man b/performance/lmbench3/scripts/html-man new file mode 100755 index 0000000..8324a30 --- /dev/null +++ b/performance/lmbench3/scripts/html-man @@ -0,0 +1,83 @@ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Take a man tree and make an html tree out of it +# +# Derived from Donners man2html script + +from=/usr/man +to=/u/eo/repository/system/unix/man + +function disambiguate +{ +newbase=${1} +newname="${newbase}.1" +dis=2 
+while [ -a "${newname}" ] + do + newname=$newbase"."$dis + dis=$(expr $dis + 1) + done +} + +while ($ARGV[0] =~ /^-/) { + if ($ARGV[0] eq "-f") { + shift(@ARGV); + $from = shift(@ARGV); + } + if ($ARGV[0] eq "-t") { + shift(@ARGV); + $to = shift(@ARGV); + } +} + +open(FD, "find $from -name '*.[0-9ln]' -print |"); +while ($find = <FD>) { +} + +if [ ! "${indexonly}" ] + then + print "Processing the man pages ..." + for i in man${sections}/* + do + if [ "$verbose" ] + then + print $i + fi + # n=${i%.*} + name=${to}/${i} + if [ -a "${name}" ] + then + oldname=$name + disambiguate $name + name=$newname + print "Collision - ${oldname} will be stored as ${name}" + fi + eqn $i | tbl | nroff -man | rman -f HTML | sed -e "s/MS_LOCAL_HOST/${localeo}/g" > ${name} + done + fi + +print "Building the index.html files ..." +cd $to +for i in man${sections} + do + if [ "$verbose" ] + then + print $i + fi + cd $i + rm -f index.html + echo '<ul>' > ../new.html + for j in * + do + if [ "$verbose" ] + then + print -n "$j " + fi + print + print "<li> <a href=$j>$j</a>" >> ../new.html + done + echo '</ul>' >> ../new.html + mv ../new.html index.html + cd .. + done diff --git a/performance/lmbench3/scripts/info b/performance/lmbench3/scripts/info new file mode 100755 index 0000000..e6860ed --- /dev/null +++ b/performance/lmbench3/scripts/info @@ -0,0 +1,7 @@ +#!/bin/sh + +UNAME=`uname -n 2>/dev/null` +if [ X$UNAME = X ] +then echo INFO +else echo INFO.$UNAME +fi diff --git a/performance/lmbench3/scripts/info-template b/performance/lmbench3/scripts/info-template new file mode 100755 index 0000000..91daa8f --- /dev/null +++ b/performance/lmbench3/scripts/info-template @@ -0,0 +1,42 @@ +Thanks very much for filling this out. The system will save it across +runs so that you don't have to do it again unless you change what you +are measuring (i.e., add disks to the mix). The stuff you fill in is +in lower case, the uppercase stuff you should leave as is. 
+ +If you used "vi" and you don't know how to use it, just type ZZ and skip +this step. + +VENDOR: i.e. SGI, Compaq, Sun, etc. For PC clones, just say clone. + +MOTHERBOARD: this mostly for PC's - it's very important to know there. + +MODEL: SGI O200, Sun Ultra2, Compaq Pressario, Gateway 10,000,000 + +YEAR BOUGHT: 1982 + +PRICE: $10,000 + +PROCESSORS: + NUMBER: 2 + TYPE: 200 Mhz Pentium Pro + +MEMORY: + AMOUNT: 32M, etc. + SPEED: i.e, 60ns, 70ns, etc. + TYPE: FPM, EDO, DIMM, etc + +CACHE: + ONCHIP DCACHE: 32K, set associative (2 or 4 way, can't remember) + ONCHIP ICACHE: 32K, set associative (2 or 4 way, can't remember) + LEVEL 2: 1MB, 2 way set associative, unified + +NETWORK: + ETHERNET: 100baseT, DEC Tulip chip, SMC PCI card + HIPPI: 100MB/sec, 64bit PCI, SGI onboard R4K processors, full duplex + +DISKS: + /dev/sda 4GB Quantum, model 1234 + +MISC: + Anything else that you think is interesting for people + to know about your system. diff --git a/performance/lmbench3/scripts/lmbench b/performance/lmbench3/scripts/lmbench new file mode 100755 index 0000000..53ea511 --- /dev/null +++ b/performance/lmbench3/scripts/lmbench @@ -0,0 +1,483 @@ +#!/bin/sh + +# lmbench - run the lmbench benchmark suite. +# +# Hacked by Larry McVoy (lm@xxxxxxx, lm@xxxxxxx, lm@xxxxxxxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id$ + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:../../scripts:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ -f $1 ] +then . $1 + echo Using config in $1 >> ${OUTPUT} +else echo Using defaults >> ${OUTPUT} + ENOUGH=1000000 + TIMING_O=0 + LOOP_O=0 + LINE_SIZE=512 +fi +export ENOUGH TIMING_O LOOP_O SYNC_MAX LINE_SIZE LMBENCH_SCHED + +if [ X$FILE = X ] +then FILE=/tmp/XXX + touch $FILE || echo Can not create $FILE >> ${OUTPUT} +fi +if [ X$MB = X ] +then MB=8 +fi +AVAILKB=`expr $MB \* 1024` + +# Figure out how big we can go for stuff that wants to use +# all and half of memory. 
+HALF="512 1k 2k 4k 8k 16k 32k 64k 128k 256k 512k 1m" +ALL="$HALF 2m" +i=4 +while [ $i -le $MB ] +do + ALL="$ALL ${i}m" + h=`expr $i / 2` + HALF="$HALF ${h}m" + i=`expr $i \* 2` +done + + +if [ X$FSDIR = X ] +then FSDIR=/usr/tmp/lat_fs +fi +MP=N +if [ $SYNC_MAX -gt 1 ] +then if [ "X$DISKS" != X ] + then echo "MP and disks are mutually exclusive (sorry)" + exit 1 + fi + if [ "X$REMOTE" != X ] + then echo "MP and remote networking are mutually exclusive (sorry)" + exit 1 + fi + MP=Y +fi + +# Figure out as much stuff as we can about this system. +# Sure would be nice if everyone had SGI's "hinv". +echo \[lmbench3.0 results for `uname -a`] 1>&2 +echo \[LMBENCH_VER: <version>] 1>&2 +echo \[BENCHMARK_HARDWARE: ${BENCHMARK_HARDWARE}] 1>&2 +echo \[BENCHMARK_OS: ${BENCHMARK_OS}] 1>&2 +echo \[ALL: ${ALL}] 1>&2 +echo \[DISKS: ${DISKS}] 1>&2 +echo \[DISK_DESC: ${DISK_DESC}] 1>&2 +echo \[ENOUGH: ${ENOUGH}] 1>&2 +echo \[FAST: ${FAST}] 1>&2 +echo \[FASTMEM: ${FASTMEM}] 1>&2 +echo \[FILE: ${FILE}] 1>&2 +echo \[FSDIR: ${FSDIR}] 1>&2 +echo \[HALF: ${HALF}] 1>&2 +echo \[INFO: ${INFO}] 1>&2 +echo \[LINE_SIZE: ${LINE_SIZE}] 1>&2 +echo \[LOOP_O: ${LOOP_O}] 1>&2 +echo \[MB: ${MB}] 1>&2 +echo \[MHZ: ${MHZ}] 1>&2 +echo \[MOTHERBOARD: ${MOTHERBOARD}] 1>&2 +echo \[NETWORKS: ${NETWORKS}] 1>&2 +echo \[PROCESSORS: ${PROCESSORS}] 1>&2 +echo \[REMOTE: ${REMOTE}] 1>&2 +echo \[SLOWFS: ${SLOWFS}] 1>&2 +echo \[OS: ${OS}] 1>&2 +echo \[SYNC_MAX: ${SYNC_MAX}] 1>&2 +echo \[LMBENCH_SCHED: $LMBENCH_SCHED] 1>&2 +echo \[TIMING_O: ${TIMING_O}] 1>&2 +echo \[LMBENCH VERSION: ${VERSION}] 1>&2 +echo \[USER: $USER] 1>&2 +echo \[HOSTNAME: `hostname`] 1>&2 +echo \[NODENAME: `uname -n`] 1>&2 +echo \[SYSNAME: `uname -s`] 1>&2 +echo \[PROCESSOR: `uname -p`] 1>&2 +echo \[MACHINE: `uname -m`] 1>&2 +echo \[RELEASE: `uname -r`] 1>&2 +echo \[VERSION: `uname -v`] 1>&2 + +echo \[`date`] 1>&2 +echo \[`uptime`] 1>&2 +netstat -i | while read i +do echo \[net: "$i"] 1>&2 + set `echo $i` + case $1 in + *ame) ;; + *) ifconfig $1 | 
while read i + do echo \[if: "$i"] 1>&2 + done + ;; + esac +done + +mount | while read i +do echo \[mount: "$i"] 1>&2 +done + +STAT=$FSDIR/lmbench +mkdir $FSDIR 2>/dev/null +touch $STAT 2>/dev/null +if [ ! -f $STAT ] +then echo "Can't make a file - $STAT - in $FSDIR" >> ${OUTPUT} + touch $STAT + exit 1 +fi +if [ X$SYNC != X ] +then /bin/rm -rf $SYNC + mkdir -p $SYNC 2>/dev/null + if [ ! -d $SYNC ] + then echo "Can't make $SYNC" >> ${OUTPUT} + exit 1 + fi +fi + +date >> ${OUTPUT} +echo Latency measurements >> ${OUTPUT} +msleep 250 +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SYSCALL = XYES ]; then + lat_syscall -P $SYNC_MAX null + lat_syscall -P $SYNC_MAX read + lat_syscall -P $SYNC_MAX write + lat_syscall -P $SYNC_MAX stat $STAT + lat_syscall -P $SYNC_MAX fstat $STAT + lat_syscall -P $SYNC_MAX open $STAT +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SELECT = XYES ]; then + for i in 10 100 250 500 + do lat_select -n $i -P $SYNC_MAX file + done + for i in 10 100 250 500 + do lat_select -n $i -P $SYNC_MAX tcp + done +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SIG = XYES ]; then + lat_sig -P $SYNC_MAX install + lat_sig -P $SYNC_MAX catch + lat_sig -P $SYNC_MAX prot lat_sig +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PIPE = XYES ]; then + lat_pipe -P $SYNC_MAX +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UNIX = XYES ]; then + lat_unix -P $SYNC_MAX +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PROC = XYES ]; then + cp hello /tmp/hello + for i in fork exec shell + do lat_proc -P $SYNC_MAX $i + done + rm -f /tmp/hello +fi +if [ X$BENCHMARK_HARDWARE = XYES -o X$BENCHMARK_OPS = XYES ]; then + lat_ops + par_ops +fi + +rm -f $FILE + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + # choose one sample bandwidth from the middle of the pack + sample=`expr $SYNC_MAX / 2` + i=0 + while [ $i -lt $SYNC_MAX ]; do + if [ $i -eq $sample ]; then + lmdd label="File $FILE write bandwidth: " \ + of=$FILE move=${MB}m fsync=1 print=3 & + else + lmdd label="File $FILE 
write bandwidth: " \ + of=$FILE.$i move=${MB}m fsync=1 print=3 \ + >/dev/null 2>&1 & + fi + i=`expr $i + 1` + done + wait + rm -f $FILE.* +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PAGEFAULT = XYES ]; then + lat_pagefault -P $SYNC_MAX $FILE +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_MMAP = XYES ]; then + echo "" 1>&2 + echo \"mappings 1>&2 + for i in $ALL + do lat_mmap -P $SYNC_MAX $i $FILE + done + echo "" 1>&2 +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + if [ X$SLOWFS != XYES ] + then date >> ${OUTPUT} + echo Calculating file system latency >> ${OUTPUT} + msleep 250 + echo '"File system latency' 1>&2 + lat_fs $FSDIR + echo "" 1>&2 + fi +fi + +if [ X$BENCHMARK_HARDWARE = XYES ]; then + if [ X"$DISKS" != X ] + then for i in $DISKS + do if [ -r $i ] + then echo "Calculating disk zone bw & seek times" \ + >> ${OUTPUT} + msleep 250 + disk $i + echo "" 1>&2 + fi + done + fi +fi + +date >> ${OUTPUT} +echo Local networking >> ${OUTPUT} +if [ ! -d ../../src/webpage-lm ] +then (cd ../../src && tar xf webpage-lm.tar) + sync + sleep 1 +fi +SERVERS="lat_udp lat_tcp lat_rpc lat_connect bw_tcp" +for server in $SERVERS; do $server -s; done +DOCROOT=../../src/webpage-lm lmhttp 8008 & +sleep 2; + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UDP = XYES ]; then + lat_udp -P $SYNC_MAX localhost +fi +lat_udp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + lat_tcp -P $SYNC_MAX localhost +fi +lat_tcp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_RPC = XYES ]; then + lat_rpc -P $SYNC_MAX -p udp localhost + lat_rpc -P $SYNC_MAX -p tcp localhost +fi +lat_rpc -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CONNECT = XYES ]; then + if [ $SYNC_MAX = 1 ]; then lat_connect localhost; fi +fi +lat_connect -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + echo "" 1>&2 + echo "Socket bandwidth using localhost" 1>&2 + for m in 1 64 128 256 512 1024 1437 10M; do + bw_tcp -P $SYNC_MAX -m 
$m localhost; + done + echo "" 1>&2 +fi +bw_tcp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HTTP = XYES ]; then + # I want a hot cache number + lat_http localhost 8008 < ../../src/webpage-lm/URLS > /dev/null 2>&1 + lat_http localhost 8008 < ../../src/webpage-lm/URLS +fi +lat_http -S localhost 8008 + +for remote in $REMOTE +do + echo Networking to $remote >> ${OUTPUT} + $RCP $SERVERS lmhttp ../../src/webpage-lm.tar ${remote}:/tmp + for server in $SERVERS + do $RSH $remote -n /tmp/$server -s & + done + $RSH $remote -n 'cd /tmp; tar xf webpage-lm.tar; cd webpage-lm; ../lmhttp 8008' & + sleep 10 + echo "[ Networking remote to $remote: `$RSH $remote uname -a` ]" 1>&2 + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UDP = XYES ]; then + lat_udp -P $SYNC_MAX $remote; + fi + lat_udp -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + lat_tcp -P $SYNC_MAX $remote; + fi + lat_tcp -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_RPC = XYES ]; then + lat_rpc -P $SYNC_MAX -p udp $remote; + lat_rpc -P $SYNC_MAX -p tcp $remote; + fi + lat_rpc -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CONNECT = XYES ]; then + if [ $SYNC_MAX = 1 ]; then lat_connect $remote; fi + fi + lat_connect -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + echo "Socket bandwidth using $remote" 1>&2 + for m in 1 64 128 256 512 1024 1437 10M; do + bw_tcp -P $SYNC_MAX -m $m $remote; + done + echo "" 1>&2 + fi + bw_tcp -S $remote + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HTTP = XYES ]; then + # I want a hot cache number + lat_http $remote 8008 < ../../src/webpage-lm/URLS > /dev/null 2>&1 + lat_http $remote 8008 < ../../src/webpage-lm/URLS + fi + lat_http -S $remote 8008 + + RM= + for server in $SERVERS + do RM="/tmp/$server $RM" + done + $RSH $remote rm $RM +done + +date >> ${OUTPUT} +echo Bandwidth measurements >> ${OUTPUT} +msleep 250 + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UNIX = XYES ]; then + bw_unix -P 
$SYNC_MAX +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PIPE = XYES ]; then + bw_pipe -P $SYNC_MAX +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + echo "" 1>&2 + echo \"read bandwidth 1>&2 + for i in $ALL + do bw_file_rd -P $SYNC_MAX $i io_only $FILE + done + echo "" 1>&2 + + echo \"read open2close bandwidth 1>&2 + for i in $ALL + do bw_file_rd -P $SYNC_MAX $i open2close $FILE + done + echo "" 1>&2 +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_MMAP = XYES ]; then + echo "" 1>&2 + echo \"Mmap read bandwidth 1>&2 + for i in $ALL + do bw_mmap_rd -P $SYNC_MAX $i mmap_only $FILE + done + echo "" 1>&2 + + echo \"Mmap read open2close bandwidth 1>&2 + for i in $ALL + do bw_mmap_rd -P $SYNC_MAX $i open2close $FILE + done + echo "" 1>&2 + rm -f $FILE +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HARDWARE = XYES \ + -o X$BENCHMARK_BCOPY = XYES ]; then + echo "" 1>&2 + echo \"libc bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i bcopy; done; echo "" 1>&2 + + echo \"libc bcopy aligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i bcopy conflict; done; echo "" 1>&2 + + echo "Memory bzero bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i bzero; done; echo "" 1>&2 + + echo \"unrolled bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i fcp; done; echo "" 1>&2 + + echo \"unrolled partial bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i cp; done; echo "" 1>&2 + + echo "Memory read bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i frd; done; echo "" 1>&2 + + echo "Memory partial read bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i rd; done; echo "" 1>&2 + + echo "Memory write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i fwr; done; echo "" 1>&2 + + echo "Memory partial write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i wr; done; echo "" 1>&2 + + echo "Memory partial read/write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i rdwr; done; echo 
"" 1>&2 +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CTX = XYES ]; then + date >> ${OUTPUT} + echo Calculating context switch overhead >> ${OUTPUT} + msleep 250 + if [ $MB -ge 8 ] + then CTX="0 4 8 16 32 64" + N="2 4 8 16 24 32 64 96" + else + CTX="0 4 8 16 32" + N="2 4 8 16 24 32 64 96" + fi + + echo "" 1>&2 + for size in $CTX + do + lat_ctx -P $SYNC_MAX -s $size $N + done + echo "" 1>&2 +fi + +if [ X$BENCHMARK_HARDWARE = XYES -o X$BENCHMARK_MEM = XYES ]; then + if [ $SYNC_MAX = 1 ]; then + date >> ${OUTPUT} + echo Calculating effective TLB size >> ${OUTPUT} + msleep 250 + tlb -L $LINE_SIZE -M ${MB}M + echo "" 1>&2 + + date >> ${OUTPUT} + echo Calculating memory load parallelism >> ${OUTPUT} + msleep 250 + echo "Memory load parallelism" 1>&2 + par_mem -L $LINE_SIZE -M ${MB}M + echo "" 1>&2 + +# date >> ${OUTPUT} +# echo Calculating cache parameters >> ${OUTPUT} +# msleep 250 +# cache -L $LINE_SIZE -M ${MB}M + fi + + date >> ${OUTPUT} + echo McCalpin\'s STREAM benchmark >> ${OUTPUT} + msleep 250 + stream -P $SYNC_MAX -M ${MB}M + stream -P $SYNC_MAX -v 2 -M ${MB}M + + date >> ${OUTPUT} + echo Calculating memory load latency >> ${OUTPUT} + msleep 250 + echo "" 1>&2 + echo "Memory load latency" 1>&2 + if [ X$FASTMEM = XYES ] + then lat_mem_rd -P $SYNC_MAX $MB 128 + else lat_mem_rd -P $SYNC_MAX $MB 16 32 64 128 256 512 1024 + fi + echo "" 1>&2 + echo "Random load latency" 1>&2 + lat_mem_rd -t -P $SYNC_MAX $MB 16 + echo "" 1>&2 +fi + +date >> ${OUTPUT} +echo '' 1>&2 +echo \[`date`] 1>&2 + +exit 0 diff --git a/performance/lmbench3/scripts/make b/performance/lmbench3/scripts/make new file mode 100755 index 0000000..59bf238 --- /dev/null +++ b/performance/lmbench3/scripts/make @@ -0,0 +1,20 @@ +#!/bin/sh + +if [ "X$MAKE" != "X" ] && echo "$MAKE" | grep -q '`' +then + MAKE= +fi + +if [ X$MAKE = X ] +then MAKE=make + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/gmake ] + then + if $p/gmake testmake > /dev/null 2>&1 + then + MAKE=$p/gmake + fi + fi + done +fi 
+echo $MAKE diff --git a/performance/lmbench3/scripts/man2html b/performance/lmbench3/scripts/man2html new file mode 100755 index 0000000..742f69f --- /dev/null +++ b/performance/lmbench3/scripts/man2html @@ -0,0 +1,254 @@ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Usage $0 manpage +# Parse my man page formats. + +die "Usage: $0 [ manpage ] \n" unless $#ARGV <= 0; + +$firstSH = 1; +$inDL = 0; + +warn "Doing $ARGV[0]\n"; + +open(STDIN, "$ARGV[0]") if ($#ARGV == 0); + +while (<>) { + next if (/^\.\\"/); + + if (/^\.TH\s/) { + # .TH BW_MEM_CP 8 "$Date: 00/01/31 15:29:42-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" + split; + print "<TITLE>$_[1]($_[2]) - LMBENCH man page</TITLE>\n"; + print "<H2>$_[1]($_[2]) - LMBENCH man page</H2><HR>\n"; + next; + } + + if (/^\.SH\s/) { + s/.SH\s+//; + s/"//g; + chop; + print "</DL>\n" unless $firstSH; $firstSH = 0; + print "</DL>\n" if $inDL; $inDL = 0; + print "<DL><DT><H4>$_</H4><DD>\n"; + next; + } + + next if &fontfont; + + if (/^\.LP\s/ || /^\.PP/) { + s/..P\s+//; + chop; + print "<P>\n"; + next; + } + + if (/^\.TP/) { # treat as a DT list + $_ = <>; + &html; + chop; + print "</DL>\n" if ($inDL); + print "<DL><DT>"; + print unless &fontfont; + print "<DD><BR>\n"; + $inDL = 1; + next; + } + + if (/^\.IP/) { # treat as a DT list + s/^\.IP\s*//; + chop; + s/"//; + s/".*//; + &html; + print "</DL>\n" if ($inDL); + print "<DL><DT>$_<DD><BR>\n"; + $inDL = 1; + next; + } + + if (/^\.sp/) { + print "<PRE>\n</PRE>\n"; + next; + } + + next if (/^\.in/ || /^\.ps/); # skip this stuff. 
+ + if (/^\.br/) { + print "<BR>\n"; + next; + } + + if (/^\.nf/ || /^\.DS/) { # starting a display + print "<PRE>\n"; + while (<>) { + last if /^\.fi/; + last if /^\.DE/; + next if /^\./; + &html; + print "\t$_"; # XXX - a screwy way of indenting + } + print "</PRE>\n"; + next; + } + + if (/^\.ft C[WB]/) { + local($pre) = 0; + + print "<CODE>\n"; + while (<>) { + last if /^\.ft\s*$/; + if (/^\.nf/) { + $pre = 1; + print "<PRE>\n"; + next; + } + if ($pre && /^\.fi/) { + print "</PRE>\n"; + $pre = 0; + next; + } + next if /^\.br/; + &html; + print; + } + print "</CODE>\n"; + next; + } + + if (/\\f\(C[WB]/) { + &html; + s/\\f\(C[WB]/<CODE>/; + while (!/\\f/) { + &html; + print; + $_ = <>; + } + s/\\fP/<\/CODE>/; + print; + next; + } + + if (/\\fB/) { + &html; + s/\\fB/<STRONG>/; + while (!/\\f/) { + print; + $_ = <>; + &html; + } + s/\\fP/<\/STRONG>/; + print; + next; + } + + if (/\\fI/) { + &html; + s/\\fB/<EM>/; + while (!/\\f/) { + print; + $_ = <>; + &html; + } + s/\\fP/<\/EM>/; + print; + next; + } + + if (/^\.ti/) { # one line display + print "<PRE>\n"; + $_ = <>; + &html; + print; + print "</PRE>\n"; + next; + } + + if (/^\.de\s+/) { + s/^\.de\s+//; + warn "$ARGV[0]: Ignoring definition: $_"; + while (<>) { + last if /^\.\./; + } + next; + } + + # Warn about unimplemented troff/man commands + if (/^\./) { + chop; + warn "$ARGV[0] unimp: \"$_\"\n"; + next; + } + + if (/\\f/) { + warn "$ARGV[0]: missed font: \"$_\"\n"; + } + + # Catchall for all the weirdball things I do. + s/^\\\&\.\\\|\.\\\|\./.../; + s/\\-/-/; + + &html; + + print; +} +exit 0; + +sub html +{ + # HTML things that I've encountered. 
+ s/"/"/g; + s/</</g; + s/>/>/g; +} + +sub fontfont { + + if (/^\.BI\s/) { + s/.BI\s+//; + chop; + split; + print "<STRONG>$_[0]</STRONG><EM>$_[1]</EM>\n"; + return 1; + } + + if (/^\.IB\s/) { + s/.IB\s+//; + chop; + split; + print "<EM>$_[0]</EM><STRONG>$_[1]</STRONG>\n"; + return 1; + } + + if (/^\.IR\s/) { + s/.IR\s+//; + chop; + split; + print "<EM>$_[0]</EM>$_[1]\n"; + return 1; + } + + if (/^\.BR\s/) { + s/.BR\s+//; + chop; + split; + print "<STRONG>$_[0]</STRONG>$_[1]\n"; + return 1; + } + + if (/^\.B\s/) { + s/.B\s+//; + chop; + print "<STRONG>$_</STRONG>\n"; + return 1; + } + + if (/^\.I\s/) { + s/.I\s+//; + chop; + print "<EM>$_</EM>\n"; + return 1; + } + + return 0; +} diff --git a/performance/lmbench3/scripts/mkrelease b/performance/lmbench3/scripts/mkrelease new file mode 100755 index 0000000..be50f03 --- /dev/null +++ b/performance/lmbench3/scripts/mkrelease @@ -0,0 +1,23 @@ +#!/bin/sh + +# %W% +# +# XXX - does not check for checked out files. + +make -s clean +make -s get +VERS=`egrep 'MAJOR|MINOR' src/version.h | awk '{print $3}'` +set `echo $VERS` +if [ $2 -lt 0 ] +then VERS=`echo $1$2 | sed s/-/alpha/` +else VERS=`echo $VERS |sed 's/ /./'` +fi +D=lmbench-$VERS +mkdir $D $D/results +cp -rp SCCS doc hbench-REBUTTAL lmbench-HOWTO scripts src $D +cp -rp results/SCCS $D/results +(cd $D && make -s get) +/bin/rm -rf $D/SCCS $D/*/SCCS +tar czvf $D.tgz $D +/bin/rm -rf $D +make -s clean diff --git a/performance/lmbench3/scripts/new2oldctx b/performance/lmbench3/scripts/new2oldctx new file mode 100755 index 0000000..3e2ed48 --- /dev/null +++ b/performance/lmbench3/scripts/new2oldctx @@ -0,0 +1,31 @@ + +# Convert the new format: +# Context switch of 8 4k processes: 64.17 (60.02 overhead) +# to the old format: +#"size=0 ovr=22 +# 2 8 +# 4 14 +# 8 18 +# 16 21 +# 20 22 + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +@lines = grep(/Context switch/, <>); +foreach $size ("0k", "4k", "16k", "32k", "64k") { + @data = grep(/$size/, @lines); + @a = @b = @c = (); + $i = 0; 
+ foreach $n (2, 4, 8, 16, 20) { + @tmp = (); + foreach $_ (grep(/of $n/, @data)) { + @_ = split; + push(@tmp, "$_[3] $_[6]\n"); + } + ($a[$i],$b[$i],$c[$i]) = @tmp; + $i++; + } + print "\n\"size=$size \n"; + print @c; +} diff --git a/performance/lmbench3/scripts/opercent b/performance/lmbench3/scripts/opercent new file mode 100755 index 0000000..8f34c1e --- /dev/null +++ b/performance/lmbench3/scripts/opercent @@ -0,0 +1,92 @@ + +eval "exec perl -sS $0 $*" + if 0; + +$fmt = 0; +@fmts = ( +"%33s %4s %4s %3s %4s %4s %4s %4s %4s %4s\n", +"%28s %6s %6s %5s %6s %7s %7s\n", +"%29s %5s %4s %5s %5s %5s %5s %4s\n", +"%30s %6s %6s %6s %8s %5s %7s\n", +"%28s %4s %4s %6s %6s %6s %6s %4s %5s\n", +"%29s %5s %6s %11s\n", +); +while (<>) { + print; + next unless /^Host/; + $_ = <>; print; + unless (/^-/) { + $_ = <>; print; + } + @values = (); + @a = @b = @c = @d = @e = @f = @g = @h = @i = @j = @k = (); + $i = 0; + while (<>) { + last if /^\s/; + print; + s/.......................\s+//; + ($a[$i],$b[$i],$c[$i],$d[$i],$e[$i],$f[$i],$g[$i],$h[$i],$i[$i],$j[$i],$k[$i]) = split; + $i++; + } + $a = &sss(@a) if $#a != -1; + $b = &sss(@b) if $#b != -1; + $c = &sss(@c) if $#c != -1; + $d = &sss(@d) if $#d != -1; + $e = &sss(@e) if $#e != -1; + $f = &sss(@f) if $#f != -1; + $g = &sss(@g) if $#g != -1; + $h = &sss(@h) if $#h != -1; + $i = &sss(@i) if $#i != -1; + $j = &sss(@j) if $#j != -1; + $k = &sss(@k) if $#k != -1; + printf $fmts[$fmt], $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k; + print "\n"; + exit if $fmt++ == $#fmts; +} + +sub sss +{ + local($tmp); + local(@values) = (); + local($n, $sum, $min, $max) = (0,0,1.7E+300,2.2E-300); + + foreach $_ (@_) { + next unless /^\d/; + chop if /K$/; + push(@values, $_); + $sum += $_; + $min = $_ if $_ < $min; + $max = $_ if $_ > $max; + $n++; + } + return "" if $#values == -1; + # Do some statistics. 
+ @s = sort(@values); + if ($n & 1) { + $median = $s[($n + 1)/2]; + } else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; + } + $avg = $sum/$n; + $avgdev = $var = 0; + foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? $tmp : -$tmp; + } + $var /= $n - 1; + $stddev = sqrt($var); + $avgdev /= $n; + #printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); + #printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; + $percent = $stddev/$median*100; + if ($percent > 90) { + printf "Huh: $percent $stddev $median @values\n"; + } + if ($percent >= 10) { + return sprintf "%.0f%%", $percent; + } else { + return sprintf "%.1f%%", $percent; + } +} diff --git a/performance/lmbench3/scripts/os b/performance/lmbench3/scripts/os new file mode 100755 index 0000000..ea767c6 --- /dev/null +++ b/performance/lmbench3/scripts/os @@ -0,0 +1,20 @@ +#!/bin/sh + +if [ "X$OS" != "X" ] && echo "$OS" | grep -q '`' +then + OS= +fi + +if [ "X$OS" = "X" ] +then OS=bloat-os + MACHINE=`uname -m | sed -e 's/ //g' | sed -e 's?/?-?g'` + SYSTEM=`uname -s | sed -e 's/ //g' | sed -e 's?/?-?g'` + OS="${MACHINE}-${SYSTEM}" + if [ -f ../scripts/gnu-os ] + then OS=`../scripts/gnu-os | sed s/unknown-//` + fi + if [ -f ../../scripts/gnu-os ] + then OS=`../../scripts/gnu-os | sed s/unknown-//` + fi +fi +echo $OS diff --git a/performance/lmbench3/scripts/output b/performance/lmbench3/scripts/output new file mode 100755 index 0000000..2a204e3 --- /dev/null +++ b/performance/lmbench3/scripts/output @@ -0,0 +1,10 @@ +#!/bin/sh +trap "echo /dev/null" 20 +OUTPUT=/dev/null; export OUTPUT +if [ -w /dev/tty ]; then + if echo "" > /dev/tty; then + OUTPUT=/dev/tty; export OUTPUT + fi +fi 2>/dev/null +echo "${OUTPUT}" +exit 0 diff --git a/performance/lmbench3/scripts/percent b/performance/lmbench3/scripts/percent new file mode 100755 index 0000000..9b98cd9 
--- /dev/null +++ b/performance/lmbench3/scripts/percent @@ -0,0 +1,95 @@ + +eval "exec perl -sS $0 $*" + if 0; + +$fmt = 0; +@fmts = ( +"%24s %4s %4s %3s %4s %5s %4s %4s %4s %5s %4s %3s\n", +"%24s %4s %6s %5s %5s %6s %7s %7s\n", +"%24s %4s %5s %4s %5s %5s %5s %5s %4s\n", +"%24s %6s %6s %6s %8s %5s %7s\n", +"%24s %3s %4s %4s %6s %7s %6s %5s %5s %5s\n", +"%24s %5s %5s %5s %12s\n", +); +while (<>) { + print; + next unless /^Host/; + $_ = <>; print; + unless (/^-/) { + $_ = <>; print; + } + @values = (); + @a = @b = @c = @d = @e = @f = @g = @h = @i = @j = @k = (); + $i = 0; + while (<>) { + last if /^\s/; + print; + s/.......................\s+//; + ($a[$i],$b[$i],$c[$i],$d[$i],$e[$i],$f[$i],$g[$i],$h[$i],$i[$i],$j[$i],$k[$i]) = split; + $i++; + } + $a = &sss(@a) if $#a != -1; + $b = &sss(@b) if $#b != -1; + $c = &sss(@c) if $#c != -1; + $d = &sss(@d) if $#d != -1; + $e = &sss(@e) if $#e != -1; + $f = &sss(@f) if $#f != -1; + $g = &sss(@g) if $#g != -1; + $h = &sss(@h) if $#h != -1; + $i = &sss(@i) if $#i != -1; + $j = &sss(@j) if $#j != -1; + $k = &sss(@k) if $#k != -1; + printf $fmts[$fmt], "", $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k; + print "\n"; + exit if $fmt++ == $#fmts; +} + +sub sss +{ + local($i, $tmp); + local(@values) = (); + local($n, $sum, $min, $max) = (0,0,1.7E+300,2.2E-300); + + foreach $_ (@_) { + next unless /^\d/; + chop if /K$/; + push(@values, $_); + $sum += $_; + $min = $_ if $_ < $min; + $max = $_ if $_ > $max; + $n++; + } + return "" if $#values == -1; + # Do some statistics. + @s = sort(@values); + if ($n & 1) { + $median = $s[($n + 1)/2]; + } else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; + } + $avg = $sum/$n; + $avgdev = $var = 0; + foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? 
$tmp : -$tmp; + } + $var /= $n - 1; + $stddev = sqrt($var); + $avgdev /= $n; + #printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); + #printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; + $percent = $stddev/$median*100; + if ($percent > 90) { + printf "Huh: $percent $stddev $median @values\n"; + } + if ($percent < .5) { + return "0 "; + } elsif ($percent < 1) { + $tmp = sprintf "%.1f%%", $percent; + return $tmp; + } else { + return sprintf "%.0f%%", $percent; + } +} diff --git a/performance/lmbench3/scripts/rccs b/performance/lmbench3/scripts/rccs new file mode 100755 index 0000000..def0785 --- /dev/null +++ b/performance/lmbench3/scripts/rccs @@ -0,0 +1,733 @@ + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Mimic the BSD tool, sccs, for RCS. +# $Id: rccs 1.7 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Note - this reflects a lot of my personal taste. I'll try and list the +# important differences here: +# +# A bunch of unused commands are not implemented. It is easy to add them, +# mail me if you want me to add something. Please include a spec of what +# you want the command to do. Mail lm@xxxxxxxxxxxx. +# +# I look at RCS file internals and know about certain fields as of revision +# 5.x. +# +# This interface does not require a list of files/directories for most +# commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +# such as clean -f, unedit, unget, do *not* have an implied list. In +# other words, +# rccs diffs is the same as rccs diffs RCS +# but +# rccs unedit is not the same as rccs unedit RCS +# +# If you add (potentially) destructive commands, please check for +# them in main() and make sure that the autoexpand does not happen. +# +# TODO: +# Make it so that you can pass a list of files/dirs via stdin. +# +# It might be nice to have all the "system" args printed out in +# verbose and/or learn mode. 
Depends on whether you want people +# to learn RCS or not. + +&init; +&main; +exit 0; # probably not reached. + +sub init +{ + $0 =~ s|.*/||; + # Add commands here so that -w shuts up. + $lint = 0; + + &clean() && &create() && &example() && &get() && &edit() && + &unedit() && &unget() && &diffs() && &delta() && &help() && + &prs() && &prt() && &deledit() && &delget() && &enter() && + &info() && &ci() && &co() && &fix() && &print() + if $lint; +} + +sub help +{ + if ($#_ == -1) { + &usage; + } + + # Handle all the aliases. + if ($_[0] eq "unedit" || $_[0] eq "unget") { + &help("clean"); + } elsif ($_[0] eq "clean") { + } + warn "Extended help on @_ not available yet.\n"; +} + +sub usage +{ +print <<EOF; + +usage: $0 [$0 opts] command [args] [file and/or directory list] + +$0 options are: + -debug for debugging of $0 itself + -verbose for more information about what $0 is doing + +More information may be had by saying "$0 help subcommand". + +Most commands take "-s" to mean do the work silently. 
+ +Command Effect +------- ------ + clean - remove unedited (ro) working files + -e remove unmodified edited (rw) & unedited (ro) files + -f (force) remove modified working files as well + create - add a set of files to RCS control and get (co) the working files + -g do not do the get (co) of the working files + -y<msg> use <msg> as the description message (aka -d<msg>) + delta - check in a revision + -y<msg> use <msg> as the log message (aka -d<msg>) + -s + diffs - diff the working file against the RCS file + fix - redit the last revision + get - get the working file[s] (possibly for editing) + history - print history of the files + print - print the history and the latest contents + +Alias Real command Effect +----- ------------ ------ + ci - delta check in a revision + co - get check out a revision + enter - create -g initialize a file without a get afterward + unedit - clean -f remove working file even if modified + unget - clean -f remove working file even if modified + edit - get -e check out the file for editing + prs - history print change log history + prt - history print change log history + +An implied list of *,v and/or RCS/*,v is implied for most commands. +The exceptions are commands that are potentially destructive, such as +unedit. + +EOF + + exit 0; +} + +sub main +{ + local($cmd); + local(@args); + local(@comma_v); + + $cmd = "oops"; + $cmd = shift(@ARGV) if $#ARGV > -1; + &help(@ARGV) if $cmd eq "help" || $cmd eq "oops"; + + $dir_specified = $file_specified = 0; + foreach $_ (@ARGV) { + # If it is an option, just pass it through. + if (/^-/) { + push(@args, $_); + } + # If they specified an RCS directory, explode it into ,v files. + elsif (-d $_) { + $dir_specified = 1; + warn "Exploding $_\n" if $debug; + push(@args, grep(/,v$/, &filelist($_))); + push(@args, grep(/,v$/, &filelist("$_/RCS"))); + } + # If it is a file, make it be the ,v file. + else { + if (!/,v$/) { + # XXX - what if both ./xxx,v and ./RCS/xxx,v? 
+ if (-f "$_,v") { + $_ .= ",v"; + } else { + if (m|/|) { + m|(.*)/(.*)|; + $f = "$1/RCS/$2,v"; + } else { + $f = "RCS/$_,v"; + } + if (-f $f) { + $_ = $f; + } + } + } + if (-f $_) { + $file_specified = 1; + warn "Adding $_\n" if $debug; + push(@args, $_); + } else { + warn "$0: skipping $_, no RCS file.\n"; + } + } + } + + # Figure out if it is a potentially destructive command. These + # commands do not automagically expand *,v and RCS/*,v. + $destructive = ($cmd eq "clean" && $args[0] eq "-f") || + $cmd eq "unedit" || $cmd eq "unget"; + + # If they didn't specify a file or a directory, generate a list + # of all ./*,v and ./RCS/*,v files. + unless ($destructive || $dir_specified || $file_specified) { + warn "Exploding . && ./RCS\n" if $debug; + push(@args, grep(/,v$/, &filelist("."))); + push(@args, grep(/,v$/, &filelist("RCS"))); + } + + unless ($cmd =~ /^create$/) { + @comma_v = grep(/,v$/, @args); + if ($#comma_v == -1) { + ($s = "$cmd @ARGV") =~ s/\s+$//; + die "$0 $s: No RCS files specified.\n"; + } + } + + # Exit codes: + # 0 - it worked + # 1 - unspecified error + # 2 - command unknown + $exit = 2; + warn "Trying &$cmd(@args)\n" if $debug; + eval(&$cmd(@args)); + + if ($exit == 2) { + warn "Possible unknown/unimplemented command: $cmd\n"; + &usage; + } else { + exit $exit; + } +} + +# Read the directory and return a list of files. +# XXX - isn't there a builtin that does this? +sub filelist +{ + local(@entries) = (); + local($ent); + + opendir(DFD, $_[0]) || return (); + foreach $ent (readdir(DFD)) { + $ent = "$_[0]/$ent"; + next unless -f $ent; + push(@entries, $ent); + } + warn "filelist($_[0]): @entries\n" if $debug; + @entries; +} + +# Take a list of ,v files and return a list of associated working files. +sub working +{ + local(@working, $working) = (); + + foreach $comma_v (@_) { + # Strip the ,v. + # Strip the RCS specification. 
+ ($working = $comma_v) =~ s|,v$||; + $working =~ s|RCS/||; + push(@working, $working); + } + @working; +} + +# Same as "clean -f" - throw away all changes +sub unedit { &clean("-f", @_); } +sub unget { &clean("-f", @_); } + +# Get rid of everything that isn't edited and has an associated RCS file. +# -e remove edited files that have not been changed. +# -f remove files that are edited with changes (CAREFUL!) +# This implies the -e opt. +# -d<m> Check in files that have been modified. If no message, prompt +# on each file. This implies -e. +# -y<m> Like -d for people that are used to SCCS. +# -m<m> Like -d for people that are used to RCS. +# +# Note: this does not use rcsclean; I don't know when that showed up. And +# the 5.x release of RCS I have does not install it. +sub clean +{ + local(@working); + local($e_opt, $f_opt, $d_opt, $s_opt) = (0,0,0,0); + local($msg); + local(@checkins) = (); + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-s") { + $s_opt = 1; + shift(@_); + } elsif ($_[0] eq "-e") { + $e_opt = 1; + shift(@_); + } elsif ($_[0] eq "-f") { + $f_opt = $e_opt = 1; + shift(@_); + } elsif ($_[0] =~ /^-[dym]/) { + $d_opt = $e_opt = 1; + if ($_[0] =~ /^-[dym]$/) { + $msg = $_[0]; + } else { + ($msg = $_[0]) =~ s/-[ydm]//; + $msg = "-m'" . $msg . "'"; + } + shift(@_); + } else { + die "$0 clean: unknown option: $_[0]\n"; + } + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Unlink. + if (!-w $working[$i]) { + warn "rm $working[$i]\n" unless $s_opt; + # Make sure there is an RCS file + if (-f $_[$i]) { + # XXX - what if ro and edited? + unlink($working[$i]) unless $n; + } else { + warn "clean: no RCS file for $working[$i]\n"; + } + next; + } + + # If they just want to know about it, tell them. 
+ if ($e_opt == 0) { + open(RCS, $_[$i]); + while (defined($r = <RCS>)) { + last if $r =~ /locks/; + } + @locks = (); + while (defined($r = <RCS>)) { + # XXX - I use "comment" a delimiter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } else { + warn "$working[$i]: " . + "writeable but not edited?!?\n"; + } + next; + } + + # See if there have actually been any changes. + # Notice that this is cmp(1) in about 10 lines of perl! + open(RCS, "co -q -p -kkvl $_[$i] |"); + open(WORK, $working[$i]); + $diff = 0; + while (defined($r = <RCS>)) { + unless (defined($w = <WORK>) && ($r eq $w)) { + $diff = 1; + last; + } + } + if (defined($w = <WORK>)) { + $diff = 1; + } + close(RCS); close(WORK); + if ($diff) { + if ($f_opt) { + warn "Clean modified $working[$i]\n" + unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } elsif ($d_opt) { + push(@checkins, $_[$i]); + } else { + warn "Can't clean modified $working[$i]\n"; + } + next; + } else { + warn "rm $working[$i]\n" unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } + } + + # Handle files that needed deltas. + if ($#checkins > -1) { + warn "ci -q $msg @checkins\n" if $verbose; + system "ci -q $msg @checkins"; + } + + $exit = 0; +} + +# Create - initialize the RCS file +# -y<c> - use <c> as the description message for all files. +# -d<c> - use <c> as the description message for all files. +# -g - don't do the get +# +# Differs from sccs in that it does not preserve the original +# files (I never found that very useful). +sub create +{ + local($arg, $noget, $description, $cmd) = ("", "", ""); + + foreach $arg (@_) { + # Options... 
+ if ($arg =~ /^-[yd]/) { + ($description = $arg) =~ s/^-[yd]//; + $arg = ""; + warn "Desc: $description\n" if $debug; + next; + } + if ($arg eq "-g") { + $noget = "yes"; + $arg = ""; + next; + } + next if ($arg =~ /^-/); + + # If no RCS subdir, make one. + if ($arg =~ m|/|) { # full path + ($dir = $arg) =~ s|/[^/]+$||; + mkdir("$dir/RCS", 0775); + } else { # in $CWD + mkdir("RCS", 0775); + } + } + $exit = 0; + if ($description ne "") { + $cmd = "ci -t-'$description' @_"; + } else { + $cmd = "ci @_"; + } + warn "$cmd\n" if $verbose; + system "$cmd"; + system "co @_" unless $noget; +} + +# Like create without the get. +sub enter { &create("-g", @_); } + +# Edit - get the working file editable +sub edit { &get("-e", @_); } + +# co - normal RCS +sub co { &get(@_); } + +# Get - get the working file +# -e Retrieve a version for editing. +# Same as co -l. +# -p Print the file to stdout. +# -k Suppress expansion of ID keywords. +# Like co -kk. +# -s Suppress all output. +# +# Note that all other options are passed to co(1). +sub get +{ + local($arg, $working, $f, $p); + + $f = $p = 0; + foreach $arg (@_) { + # Options... + $arg = "-l" if ($arg eq "-e"); + $arg = "-kk" if ($arg eq "-k"); + $arg = "-q" if ($arg eq "-s"); + $f = 1 if ($arg eq "-f"); + $p = 1 if ($arg eq "-p"); # XXX - what if -sp? + + next if $arg =~ /^-/ || $p; + + # Check for writable files and skip them unless someone asked + # for co's -f option. + ($working = $arg) =~ s|,v$||; + $working =~ s|RCS/||; + if ((-w $working) && $f == 0) { + warn "ERROR [$arg]: writable `$working' exists.\n"; + $arg = ""; + } + } + @files = grep(/,v/, @_); + if ($#files == -1) { + warn "$0 $cmd: no files to get. @_\n"; + $exit = 1; + } else { + system "co @_"; + $exit = 0; + } +} + +# Aliases for history. 
+sub prt { &history(@_); } +sub prs { &history(@_); } + +# History - change history sub command +sub history +{ + local(@history); + + open(RL, "rlog @_|"); + # Read the whole history + while (defined($r = <RL>)) { + # Read the history for one file. + if ($r !~ /^[=]+$/) { + push(@history, $r); + next; + } + &print_history(@history); + @history = (); + } + close(RL); + print "+-----------------------------------\n"; + $exit = 0; +} + +sub print_history +{ + for ($i = 0; $i <= $#_; ++$i) { + # Get the one time stuff + if ($_[$i] =~ /^RCS file:/) { + $_[$i] =~ s/RCS file:\s*//; + chop($_[$i]); + print "+------ $_[$i] -------\n|\n"; + } + + # Get the history + if ($_[$i] =~ /^----------------------------/) { + local($rev, $date, $author, $lines) = ("", "", "", ""); + + $i++; + die "Bad format\n" unless $_[$i] =~ /revision/; + $_[$i] =~ s/revision\s+//; + chop($_[$i]); + $rev = $_[$i]; + $i++; + die "Bad format\n" unless $_[$i] =~ /date/; + @parts = split(/[\s\n;]+/, $_[$i]); + for ($j = 0; $j <= $#parts; $j++) { + if ($parts[$j] =~ /date/) { + $j++; + $date = "$parts[$j] "; + $j++; + $date .= "$parts[$j]"; + } + if ($parts[$j] =~ /author/) { + $j++; + $author = $parts[$j]; + } + if ($parts[$j] =~ /lines/) { + $j++; + $lines = "$parts[$j] "; + $j++; + $lines .= "$parts[$j]"; + } + } + print "| $rev $date $author $lines\n"; + while ($_[++$i] && + $_[$i] !~ /^----------------------------/) { + print "| $_[$i]"; ### unless $rev =~ /^1\.1$/; + } + print "|\n"; + $i--; + } + } +} + +# Show changes between working file and RCS file +# +# -C -> -c for compat with sccs (not sure if this is needed...). +sub diffs +{ + local(@working); + local($diff) = "diff"; + local($rev) = ""; + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-C") { + $diff .= " -c"; + shift(@_); + } elsif ($_[0] =~ /^-r/) { + $rev = $_[0]; + shift(@_); + } elsif ($_[0] eq "-sdiff") { + $TIOCGWINSZ = 1074295912; # IRIX 5.x, 6.x, and SunOS 4.x. Cool. 
+ $buf = "abcd"; + if (ioctl(STDIN, $TIOCGWINSZ, $buf)) { + ($row, $col) = unpack("ss", $buf); + $wid = $col; + $row = 1 if 0; # lint + } else { + $wid = 80; + } + $diff = "sdiff -w$wid"; + shift(@_); + } else { + $diff .= " $_[0]"; + shift(@_); + } + + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Skip. + next unless (-w $working[$i]); + + # Show the changes + select(STDOUT); $| = 1; + print "\n------ $working[$i]$rev ------\n"; + $| = 0; + # XXX - flush stdout. + if ($diff =~ /^sdiff/) { + system "co -q -p -kkvl $rev $_[$i] > /tmp/sdiff.$$" . + "&& $diff /tmp/sdiff.$$ $working[$i]"; + # XXX - interrupts? + unlink("/tmp/sdiff.$$"); + } else { + system "co -q -p -kkvl $rev $_[$i] |" . + " $diff - $working[$i]"; + } + } + + $exit = 0; +} + +# delta - check in the files +sub delta +{ + local($description) = (""); + local($i, @working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # Options... + if ($_[$i] =~ /^-[yd]/) { + ($description = $_[$i]) =~ s/^-[yd]/-m/; + $description = "'" . $description . "'"; + $_[$i] = ""; + next; + } + $_[$i] = "-q" if $_[$i] eq "-s"; + $_[$i] = "" unless -f $working[$i]; + } + $exit = 0; + warn "ci $description @_\n" if $verbose; + system "ci $description @_"; +} + +# Allow RCS interface ci +sub ci +{ + &delta(@_); +} + +# delget +sub delget +{ + &delta(@_); + &get(@_); # If there was a description, delta nuked it... +} + +# deledit +sub deledit +{ + &delta(@_); + &get("-e", @_); # If there was a description, delta nuked it... +} + + +# info - who is editing what +sub info +{ + local(@working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; $i++) { + open(RCS, $_[$i]); + while (defined($r = <RCS>)) { + last if $r =~ /locks/; + } + @locks = (); + while (defined($r = <RCS>)) { + # XXX - I use "comment" a delimter. 
+ last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } + } + $exit = 0; +} + +# Fix - fix the last change to a file +sub fix +{ + foreach $f (@_) { + next unless -f $f; + open(F, $f); + while (defined(<F>)) { last if /head\s\d/; } close(F); + unless ($_ && /head/) { + warn "$0 $cmd: No head node found in $f\n"; + next; + } + s/head\s+//; chop; chop; $rev = $_; + ($working = $f) =~ s/,v//; + $working =~ s|RCS/||; + system "co -q $f && rcs -o$rev $f && rcs -l $f && chmod +w $working"; + } + $exit = 0; +} + +# print - print the history and the latest revision of the file +sub print +{ + local($file); + + foreach $file (@_) { + &history($file); + &get("-s", "-p", $file); + } + $exit = 0; +} + + +# Example - example sub command +# -Q change this option to -q just to show how. +sub example +{ + local($arg, $working); + + foreach $arg (@_) { + # Options... + $arg = "-Q" if ($arg eq "-q"); + } + warn "rlog @_\n" if $verbose; + system "rlog @_"; + $exit = 0; +} + diff --git a/performance/lmbench3/scripts/results b/performance/lmbench3/scripts/results new file mode 100755 index 0000000..cd07c15 --- /dev/null +++ b/performance/lmbench3/scripts/results @@ -0,0 +1,39 @@ +#!/bin/sh + +# $Id$ + +OS=`../scripts/os` +CONFIG=`../scripts/config` +RESULTS=results/$OS +BASE=../$RESULTS/`uname -n` +EXT=0 + +if [ ! -f "../bin/$OS/$CONFIG" ] +then echo "No config file?" + exit 1 +fi +. ../bin/$OS/$CONFIG + +if [ ! 
-d ../$RESULTS ] +then mkdir -p ../$RESULTS +fi +RESULTS=$BASE.$EXT +while [ -f $RESULTS ] +do EXT=`expr $EXT + 1` + RESULTS=$BASE.$EXT +done + +cd ../bin/$OS +PATH=.:${PATH}; export PATH +export SYNC_MAX +export OUTPUT +lmbench $CONFIG 2>../${RESULTS} + +if [ X$MAIL = Xyes ] +then echo Mailing results + (echo ---- $INFO --- + cat $INFO + echo ---- $RESULTS --- + cat ../$RESULTS) | mail lmbench3@xxxxxxxxxxxx +fi +exit 0 diff --git a/performance/lmbench3/scripts/save b/performance/lmbench3/scripts/save new file mode 100755 index 0000000..cf61997 --- /dev/null +++ b/performance/lmbench3/scripts/save @@ -0,0 +1,26 @@ +# Save the input in the specified file if possible. If the file exists, +# add a numeric suffice, i.e., .1, and increment that until the file +# does not exist. Use the first name found as the file to save. +# +# Typical usage is: xroff -man -fH *.1 | save MAN.PS +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: save 1.4 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$base = $#ARGV == 0 ? shift : "save"; +$file = $base; +$ext = 1; + +while (-e $file) { + $file = "$base.$ext"; + $ext++; +} +warn "Saving in $file\n"; +open(FD, ">$file"); +while(<>) { + print FD; +} +exit 0; diff --git a/performance/lmbench3/scripts/stats b/performance/lmbench3/scripts/stats new file mode 100755 index 0000000..0b60667 --- /dev/null +++ b/performance/lmbench3/scripts/stats @@ -0,0 +1,50 @@ + +# Convert the Y coordinate to an average + +eval "exec perl -sS $0 $*" + if 0; + +@values = (); +$sum = $n = 0; +$min = 1.7E+308; +$max = 2.2E-308; +while (<>) { + next if /^[%#]/; + split; + if ($_[0] > 1000000) { + #warn "$file: ignoring $_"; + next; + } + if ($#_ >= 1) { + $val = $_[1]; + } else { + $val = $_[0]; + } + push(@values, $val); + $sum += $val; + $min = $val if $val < $min; + $max = $val if $val > $max; + $n++; +} +# Do some statistics. 
+@s = sort(@values); +if ($n & 1) { + $median = $s[($n + 1)/2]; +} else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; +} +$avg = $sum/$n; +$avgdev = $var = 0; +foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? $tmp : -$tmp; +} +$var /= $n - 1; +$stddev = sqrt($var); +$avgdev /= $n; +#printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); +#printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; +printf "%4.1f%%\n", $stddev/$median*100; +exit 0; diff --git a/performance/lmbench3/scripts/statsummary b/performance/lmbench3/scripts/statsummary new file mode 100755 index 0000000..21e6266 --- /dev/null +++ b/performance/lmbench3/scripts/statsummary @@ -0,0 +1,1075 @@ + +# Generate an ascii summary from lmbench result files BY HOSTNAME +# instead of architecture. Sorry, I think of these tools as being +# used to measure and prototype particular named systems, not as +# being useful to measure once and for all "i686-linux" systems, +# which might well have different motherboards, chipsets, memory +# clocks, CPU's (anything from PPro through to PIII so far) and +# so forth. Linux systems are far to heterogeneous to be categorized +# with two or three descriptors, so might as well just use hostname +# for shorthand... +# +# Usage: statsummary file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# $Id: statsummary,v 1.5 2000/07/08 21:06:49 rgb Exp $ +# +# +# Edit History. I'm starting out with Larry's getsummary. Then I'm +# going to splice in a very simple set of stats routines that are +# passed an array in his standard form and return a structure containing +# max, min, mean, median, unbiased standard deviation and we'll go from +# there. 
However I'll likely print out only mean and SD and will try +# to preserve Larry's general layout at that. Oh, and I'm going to add +# COMMENTS to the script. Drives me nuts to work on something without +# comments. 7/6/00 + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# +# This segment loops through all the output files and pushes the +# specific field values it needs into suitably named arrays. It +# counts while it does so so it can check to be sure that all +# the input files are complete. +$n = 0; +@hosts = (); +foreach $file (@ARGV) { + open(FD, $file) || die "$0: can't open $file"; + # I just want @file to contain the hostname, not the path or architecture. + # However, we have reason to need the associated filename (no path) to + # to help with debugging. + # Strip off the path + $file =~ s/(.*)\///; + # Split the filename from the number. This will probably break if the + # hostname contains more "."'s. However, I'm too lazy to figure out + # how to make this work totally robustly. It would be easy if the + # the host datafiles were all created according to the "hostname.count" + # format, because then a simple regexp would pull off just the hostname + # or the count. Not so easy when a hostname/count might contain no "."'s + # at all... + $filecount = ""; + ($file,$filecount) = split(/\./,$file); + # fix silly bug caused by starting numbering at blank. + if(! $filecount){ + $filecount = 0; + } + # Debugging... + # print STDERR "Found file $file with count $filecount\n"; + push(@file, $file); + push(@filecount, $filecount); + + # This should just push UNIQUE new hosts onto @hosts. 
+ $numhosts = @hosts; + if($numhosts){ + $lasthost = $hosts[$numhosts-1]; + } else { + $lasthost = ""; + } + if($lasthost !~ /$file/){ + push(@hosts, $file); + } + + $mhz = 0; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + push(@uname, $_); + if (/lmbench1\./) { + $version = 1; + } else { + $version = 2; + } + } + if (/MHZ/ && !$mhz) { + @_ = split; + $_[1] =~ s/\]//; + push(@misc_mhz, $_[1]); + $mhz = 1; + } elsif (/Mhz/ && !$mhz) { + @_ = split; + push(@misc_mhz, $_[0]); + $mhz = 1; + } + if (/^Select on 100 fd/) { + @_ = split; + push(@lat_select, $_[4]); + $tmp = $lat_select[0]; # Just to shut up the error parser + } + if (/^Simple syscall:/) { + @_ = split; + push(@lat_syscall, $_[2]); + $tmp = $lat_syscall[0]; # Just to shut up the error parser + } + if (/^Simple read:/) { + @_ = split; + push(@lat_read, $_[2]); + $tmp = $lat_read[0]; # Just to shut up the error parser + } + if (/^Simple write:/) { + @_ = split; + push(@lat_write, $_[2]); + $tmp = $lat_write[0]; # Just to shut up the error parser + } + if (/^Simple stat:/) { + @_ = split; + push(@lat_stat, $_[2]); + $tmp = $lat_stat[0]; # Just to shut up the error parser + } + if (/^Simple open.close:/) { + @_ = split; + push(@lat_openclose, $_[2]); + $tmp = $lat_openclose[0]; # Just to shut up the error parser + } + if (/^Null syscall:/) { # Old format. 
+ @_ = split; + push(@lat_write, $_[2]); + $tmp = $lat_write[0]; # Just to shut up the error parser + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_siginstall, $_[3]); + $tmp = $lat_siginstall[0]; # Just to shut up the error parser + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_sigcatch, $_[3]); + $tmp = $lat_sigcatch[0]; # Just to shut up the error parser + } + if (/^Protection fault:/) { + @_ = split; + push(@lat_protfault, $_[2]); + $tmp = $lat_protfault[0]; # Just to shut up the error parser + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + $tmp = $lat_pipe[0]; # Just to shut up the error parser + } + if (/AF_UNIX sock stream latency:/) { + @_ = split; + push(@lat_unix, $_[4]); + $tmp = $lat_unix[0]; # Just to shut up the error parser + } + if (/^UDP latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + $tmp = $lat_udp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_udp_net, $_[4]); + $tmp = $lat_udp_net[0]; # Just to shut up the error parser + } + } + if (/^TCP latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + $tmp = $lat_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_tcp_net, $_[4]); + $tmp = $lat_tcp_net[0]; # Just to shut up the error parser + } + } + if (/^RPC\/udp latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + $tmp = $lat_rpc_udp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_rpc_udp_net, $_[4]); + $tmp = $lat_rpc_udp_net[0]; # Just to shut up the error parser + } + } + if (/^RPC\/tcp latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + $tmp = $lat_rpc_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_rpc_tcp_net, $_[4]); + $tmp = $lat_rpc_tcp_net[0]; # Just to shut up the error parser + } + } + if 
(/^TCP\/IP connection cost to /) { + if(/localhost:/) { + @_ = split; + push(@lat_tcp_connect_local, $_[5]); + $tmp = $lat_tcp_connect_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_tcp_connect_net, $_[5]); + $tmp = $lat_tcp_connect_net[0]; # Just to shut up the error parser + } + } + if (/^Socket bandwidth using /) { + if(/localhost:/) { + @_ = split; + push(@bw_tcp_local, $_[4]); + $tmp = $bw_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@bw_tcp_net, $_[4]); + $tmp = $bw_tcp_net[0]; # Just to shut up the error parser + } + } + if (/^AF_UNIX sock stream bandwidth:/) { + @_ = split; + push(@bw_unix, $_[4]); + $tmp = $bw_unix[0]; # Just to shut up the error parser + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + $tmp = $lat_nullproc[0]; # Just to shut up the error parser + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + $tmp = $lat_simpleproc[0]; # Just to shut up the error parser + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + $tmp = $lat_shproc[0]; # Just to shut up the error parser + } + if (/^Pipe bandwidth/) { + @_ = split; + push(@bw_pipe, $_[2]); + $tmp = $bw_pipe[0]; # Just to shut up the error parser + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + $tmp = $bw_file[0]; # Just to shut up the error parser + } + if (/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + $tmp = $lat_pagefault[0]; # Just to shut up the error parser + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + $tmp = $lat_mappings[0]; # Just to shut up the error parser + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + $tmp = $bw_reread[0]; # Just to shut up the error parser + } + if (/^"Mmap read bandwidth/) { + $value = 
&getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + $tmp = $bw_mmap[0]; # Just to shut up the error parser + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + $tmp = $bw_bcopy_libc[0]; # Just to shut up the error parser + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + $tmp = $bw_bcopy_unrolled[0]; # Just to shut up the error parser + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + $tmp = $bw_mem_rdsum[0]; # Just to shut up the error parser + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + $tmp = $bw_mem_wr[0]; # Just to shut up the error parser + } + if (/^"File system latency/) { + while (<FD>) { + next if /Id:/; + if (/^0k/) { + @_ = split; + push(@fs_create_0k, $_[2]); + push(@fs_delete_0k, $_[3]); + $tmp = $fs_create_0k[0]; # Just to shut up the error parser + $tmp = $fs_delete_0k[0]; # Just to shut up the error parser + } elsif (/^1k/) { + @_ = split; + push(@fs_create_1k, $_[2]); + push(@fs_delete_1k, $_[3]); + $tmp = $fs_create_1k[0]; # Just to shut up the error parser + $tmp = $fs_delete_1k[0]; # Just to shut up the error parser + } elsif (/^4k/) { + @_ = split; + push(@fs_create_4k, $_[2]); + push(@fs_delete_4k, $_[3]); + $tmp = $fs_create_4k[0]; # Just to shut up the error parser + $tmp = $fs_delete_4k[0]; # Just to shut up the error parser + } elsif (/^10k/) { + @_ = split; + push(@fs_create_10k, $_[2]); + push(@fs_delete_10k, $_[3]); + $tmp = $fs_create_10k[0]; # Just to shut up the error parser + $tmp = $fs_delete_10k[0]; # Just to shut up the error parser + } else { + last; + } + } + } + if (/size=0/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx0_2, $_[1]); + $tmp = $lat_ctx0_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; 
push(@lat_ctx0_8, $_[1]); + $tmp = $lat_ctx0_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx0_16, $_[1]); + $tmp = $lat_ctx0_16[0]; # Just to shut up the error parser + } + last if /^\s*$/ || /^Memory/; + } + } + if (/size=16/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx16_2, $_[1]); + $tmp = $lat_ctx16_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; push(@lat_ctx16_8, $_[1]); + $tmp = $lat_ctx16_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx16_16, $_[1]); + $tmp = $lat_ctx16_16[0]; # Just to shut up the error parser + } + last if /^\s*$/; + } + } + if (/size=64/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx64_2, $_[1]); + $tmp = $lat_ctx64_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; push(@lat_ctx64_8, $_[1]); + $tmp = $lat_ctx64_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx64_16, $_[1]); + $tmp = $lat_ctx64_16[0]; # Just to shut up the error parser + } + last if /^\s*$/ || /^20/; + } + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + @_ = split; + push(@lat_l1, $_[1]); + $tmp = $lat_l1[0]; # Just to shut up the error parser + } elsif (/^0.12500\s/) { + @_ = split; + push(@lat_l2, $_[1]); + $tmp = $lat_l2[0]; # Just to shut up the error parser + } elsif (/^[45678].00000\s/) { + @_ = split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + } + @warn = (); + foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_unix', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_ctx0_16', 
'lat_ctx0_2', 'lat_ctx0_8', + 'lat_ctx16_16', 'lat_ctx16_2', 'lat_ctx16_8', + 'lat_ctx64_16', 'lat_ctx64_2', 'lat_ctx64_8', 'lat_l1', + 'lat_l2', 'lat_mappings', 'lat_mem', 'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', + 'lat_rpc_tcp_local','lat_rpc_udp_local', + 'lat_tcp_connect_local', 'lat_tcp_local', 'lat_udp_local', + 'lat_rpc_tcp_net','lat_rpc_udp_net', + 'lat_tcp_connect_net', 'lat_tcp_net', 'lat_udp_net', + 'lat_select', 'lat_shproc', 'lat_sigcatch', + 'lat_siginstall', 'lat_simpleproc', 'lat_stat', + 'lat_syscall', 'lat_unix', 'lat_write', 'misc_mhz', + ) { + $last = eval '$#' . $array; + if ($last != $n) { + #warn "No data for $array in $file\n"; + push(@warn, $array); + eval 'push(@' . $array . ', -1);'; + } + } + if ($#warn != -1) { + warn "Missing data in $file: @warn\n"; + } + $n++; +} + +# +# OK, now all those arrays are packed. Because everything is keyed +# on raw hostname, we can do all the stats evaluations using a combination +# of @file and the array -- we march through @file and create a stats +# object (a % hash) with its name and do the obvious sums and so forth. +# should be very simple. +# +# However, to be fair to Larry, we do want to preserve the general flavor +# of the summary. However, the summary is now going to be output BY HOST +# and so we need a separate host-description section for each host. +# +# First we have to evaluate the stats, though. +# + +# +# Let's test this with just one small set of values... 
+foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_unix', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_l1', + 'lat_l2', 'lat_mappings', 'lat_mem', 'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', + 'lat_rpc_tcp_local','lat_rpc_udp_local', + 'lat_tcp_connect_local', 'lat_tcp_local', 'lat_udp_local', + 'lat_rpc_tcp_net','lat_rpc_udp_net', + 'lat_tcp_connect_net', 'lat_tcp_net', 'lat_udp_net', + 'lat_select', 'lat_shproc', 'lat_sigcatch', + 'lat_siginstall', 'lat_simpleproc', 'lat_stat', + 'lat_syscall', 'lat_unix', 'lat_write', 'misc_mhz', + ) { } # Empty just to save the full list someplace handy. + +# +# Oops. For some unfathomable reason lat_fs returns something other than +# an (average) time in nanoseconds. Why, I cannot imagine -- one could +# trivially invert so that it did so. One CANNOT DO STATS on inverse +# quantities, so we invert here and convert to nanoseconds +# so we can correctly do stats below. +foreach $array ( + 'fs_create_0k','fs_delete_0k','fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k','fs_create_10k','fs_delete_10k', + ) { + $cnt = 0; + foreach $entry (@$array){ + $$array[$cnt++] = 1.0e+9/$entry; + } + +} + +# Working copy. Let's just add things as they turn out to be +# appropriate. In fact, we'll add them in presentation order! 
+foreach $array ( + 'lat_syscall','lat_read', 'lat_write', 'lat_syscall', 'lat_stat', + 'lat_openclose','lat_select','lat_siginstall','lat_sigcatch', + 'lat_nullproc','lat_simpleproc','lat_shproc', + 'lat_ctx0_2','lat_ctx0_16','lat_ctx0_8', + 'lat_ctx16_16','lat_ctx16_2','lat_ctx16_8', + 'lat_ctx64_16','lat_ctx64_2','lat_ctx64_8', + 'lat_pipe','lat_unix', + 'lat_udp_local','lat_tcp_local',lat_tcp_connect_local, + 'lat_rpc_udp_local','lat_rpc_tcp_local', + 'lat_udp_net','lat_tcp_net',lat_tcp_connect_net, + 'lat_rpc_udp_net','lat_rpc_tcp_net', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_mappings','lat_protfault','lat_pagefault', + 'bw_pipe','bw_unix', + 'bw_tcp_local', # Note we need bw_udp_local as soon as it exists... + 'bw_reread','bw_mmap','bw_bcopy_libc','bw_bcopy_unrolled', + 'bw_mem_rdsum','bw_mem_wr', + 'bw_tcp_net', + 'lat_l1','lat_l2','lat_mem', + ) { + + # + # This should do it all, by name and collapsed by hostname + # + makestats($array); + +} + +# +# Fine, that seems to work. Now we break up the summary, BY HOST. +# For each host we print just ONE TIME key values that don't really +# vary (like its architecture information and clock). Then we print +# out a modified version of Larry's old summary. +# + +# +# First the header +# +print<<EOF; +======================================================================== + + L M B E N C H 3 . 0 S U M M A R Y + ------------------------------------ + +======================================================================== + +EOF + +# +# Now a host loop. Notice that @hosts is a list of hosts +# +$numhosts = @hosts; +for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + # Obviously we need a better way to fill in this information. + # Linux provides /proc/cpuinfo, which is just perfect and trivial + # to parse. However, we should probably read this in from e.g. 
+ # config/$host.conf, which can be created either automagically or + # by hand. This file should also be used to control the running + # of the benchmark suite, which in turn should be done by means of + # a script call, not a make target. I'm getting there... + # + # Oh, one last note. It would be VERY CONVENIENT to have the config + # information stored in perl. So convenient that the following should + # BE the format of the config file... (up to the next comment) + $CPU = "Celeron(Mendocino)"; + $CPUFAMILY = "i686"; + $MHz = 400; + $L1CODE = 16; + $L1DATA = 16; + $L2SIZE = 128; + $memsize = 128; + $memspeed = "PC100"; + $memtype = "SDRAM"; + @DISKS = ("/dev/hda","/dev/hdb","/dev/hdc"); + @DISKTYPE = ("IBM-DJNA-371350, ATA DISK drive", "Disk 2", "Disk etc."); + @NETWORKS = ("ethernet-100","SneakerNet @ 3 meters/second"); + @NICTYPE = ("Lite-On 82c168 PNIC rev 32","Nike Sports (etc.)"); + @NETHUB = ("Netgear FS108 Fast Ethernet Switch","The Floor"); + # + # OK, given this wealth of detail (which can be sourced directly into + # the perl script from the host config file if we are clever) we now + # print it into the report/summary. 
+ # + printf("HOST:\t\t$host\n"); + printf("CPU:\t\t$CPU\n"); + printf("CPU Family:\t$CPUFAMILY\n"); + printf("MHz:\t\t$MHz\n"); + printf("L1 Cache Size:\t$L1CODE KB (code)/$L1DATA KB (data)\n"); + printf("L2 Cache Size:\t$L2SIZE KB\n"); + printf("Memory:\t\t$memsize MB of $memspeed $memtype\n"); + printf("OS Kernel:\t%13s\n",&getos($uname[0])); + printf("Disk(s):\n"); + $numdisks = @DISKS; + for($j=0;$j<$numdisks;$j++){ + printf("\t\t%d) %s: %s\n",$j+1,$DISKS[$j],$DISKTYPE[$j]); + } + printf("Network(s):\n"); + $numnets = @NETWORKS; + for($j=0;$j<$numnets;$j++){ + printf("\t\t%d) %s: %s\n",$j+1,$NETWORKS[$j],$NICTYPE[$j]); + printf("\t\t Switch/Hub: %s\n",$NETHUB[$j]); + } + print<<EOF; + + +------------------------------------------------------------------------ +Processor, Processes - average times in microseconds - smaller is better +------------------------------------------------------------------------ + null null open/ + call Error I/O Error stat Error close Error +------ ------ ------ ------ ------ ------ ------ ------ +EOF + +# +# In all the output below, averaged arrays are accessed by the hash: +# $stats{$host}{$array}{mean or stddev} (or whatever) + + @fs_delete_4k = @lat_ctx0_8 = @bw_file = @lat_ctx0_16 = @fs_delete_1k = + @fs_create_4k = @fs_create_1k + if 0; # lint + + # If they have no /dev/zero, use /dev/null, else average them. + if ($stats{$host}{lat_read}{mean} == -1) { + $lat_rw_mean = $stats{$host}{lat_write}{mean}; + $lat_rw_stddev = $stats{$host}{lat_write}{stddev}; + } else { + $lat_rw_mean = ($stats{$host}{lat_read}{mean} + $stats{$host}{lat_write}{mean})/2; + $lat_rw_stddev = ($stats{$host}{lat_read}{stddev} + $stats{$host}{lat_write}{stddev})/2; + } + # We have to pick a format adequate for these numbers. We'll shoot for + # %5.2f and see how it goes. 
+ printf("%6.3f %6.3f ",$stats{$host}{lat_syscall}{mean},$stats{$host}{lat_syscall}{stddev}); + printf("%6.3f %6.3f ",$lat_rw_mean,$lat_rw_stddev); + printf("%6.3f %6.3f ",$stats{$host}{lat_stat}{mean},$stats{$host}{lat_stat}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_openclose}{mean},$stats{$host}{lat_openclose}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ + signal signal +select Error instll Error catch Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.1f %6.2f ",$stats{$host}{lat_select}{mean},$stats{$host}{lat_select}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_siginstall}{mean},$stats{$host}{lat_siginstall}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_sigcatch}{mean},$stats{$host}{lat_sigcatch}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ + fork exec shell + proc Error proc Error proc Error +------- ------- ------- ------- ------- ------- +EOF + printf("%7.1f %7.2f ", + $stats{$host}{lat_nullproc}{mean},$stats{$host}{lat_nullproc}{stddev}); + printf("%7.1f %7.2f ", + $stats{$host}{lat_simpleproc}{mean},$stats{$host}{lat_simpleproc}{stddev}); + printf("%7.1f %7.2f ", + $stats{$host}{lat_shproc}{mean},$stats{$host}{lat_shproc}{stddev}); + # End with this to complete the line... 
+ printf("\n"); + print<<EOF; + + +------------------------------------------------------------------------ +Context switching - times in microseconds - smaller is better +------------------------------------------------------------------------ +2p/0K 2p/16K 2p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_2}{mean},$stats{$host}{lat_ctx0_2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_2}{mean},$stats{$host}{lat_ctx16_2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_2}{mean},$stats{$host}{lat_ctx64_2}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ +8p/0K 8p/16K 8p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_8}{mean},$stats{$host}{lat_ctx16_8}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_8}{mean},$stats{$host}{lat_ctx16_8}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_8}{mean},$stats{$host}{lat_ctx64_8}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ +16p/0K 16p/16K 16p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_16}{mean},$stats{$host}{lat_ctx0_16}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_16}{mean},$stats{$host}{lat_ctx16_16}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_16}{mean},$stats{$host}{lat_ctx64_16}{stddev}); + # End with this to complete the line... 
+ printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Local* Communication latencies in microseconds - smaller is better +------------------------------------------------------------------------ + Pipe AF + Error UNIX Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_pipe}{mean},$stats{$host}{lat_pipe}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_unix}{mean},$stats{$host}{lat_unix}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + UDP TCP TCP + Error Error Connect Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_udp_local}{mean},$stats{$host}{lat_udp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_local}{mean},$stats{$host}{lat_tcp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_connect_local}{mean},$stats{$host}{lat_tcp_connect_local}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ 
+ RPC RPC + UDP Error TCP Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_udp_local}{mean},$stats{$host}{lat_rpc_udp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_tcp_local}{mean},$stats{$host}{lat_rpc_tcp_local}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Network* Communication latencies in microseconds - smaller is better +------------------------------------------------------------------------ + UDP TCP TCP + Error Error Connect Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_udp_net}{mean},$stats{$host}{lat_udp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_net}{mean},$stats{$host}{lat_tcp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_connect_net}{mean},$stats{$host}{lat_tcp_connect_net}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + RPC RPC + UDP Error TCP Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_udp_net}{mean},$stats{$host}{lat_rpc_udp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_tcp_net}{mean},$stats{$host}{lat_rpc_tcp_net}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +File & VM system latencies in microseconds - smaller is better +------------------------------------------------------------------------ + 0k File 1K File +Create Error Delete Error Create Error Delete Error +------- ------- ------- ------- ------- ------- ------- ------- +EOF + $c0k = $stats{$host}{fs_create_0k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_0k}{mean}/1000; + $c0kerr = $stats{$host}{fs_create_0k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_0k}{stddev}/1000; + $d0k = $stats{$host}{fs_delete_0k}{mean} <= 0 ? 
-1 : $stats{$host}{fs_delete_0k}{mean}/1000; + $d0kerr = $stats{$host}{fs_delete_0k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_0k}{stddev}/1000; + $c1k = $stats{$host}{fs_create_1k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_1k}{mean}/1000; + $c1kerr = $stats{$host}{fs_create_1k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_1k}{stddev}/1000; + $d1k = $stats{$host}{fs_delete_1k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_1k}{mean}/1000; + $d1kerr = $stats{$host}{fs_delete_1k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_1k}{stddev}/1000; + printf("%7.2f %7.3f ", + $c0k,$c0kerr); + printf("%7.2f %7.3f ", + $d0k,$d0kerr); + printf("%7.2f %7.3f ", + $c1k,$c1kerr); + printf("%7.2f %7.3f ", + $d1k,$d1kerr); + printf("\n"); + print<<EOF; +........................................................................ + 4k File 10K File +Create Error Delete Error Create Error Delete Error +------- ------- ------- ------- ------- ------- ------- ------- +EOF + $c4k = $stats{$host}{fs_create_4k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_4k}{mean}/1000; + $c4kerr = $stats{$host}{fs_create_4k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_4k}{stddev}/1000; + $d4k = $stats{$host}{fs_delete_4k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_4k}{mean}/1000; + $d4kerr = $stats{$host}{fs_delete_4k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_4k}{stddev}/1000; + $c10k = $stats{$host}{fs_create_10k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_10k}{mean}/1000; + $c10kerr = $stats{$host}{fs_create_10k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_10k}{stddev}/1000; + $d10k = $stats{$host}{fs_delete_10k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_10k}{mean}/1000; + $d10kerr = $stats{$host}{fs_delete_10k}{stddev} <= 0 ? 
-1 : $stats{$host}{fs_delete_10k}{stddev}/1000; + printf("%7.2f %7.3f ", + $c4k,$c4kerr); + printf("%7.2f %7.3f ", + $d4k,$d4kerr); + printf("%7.2f %7.3f ", + $c10k,$c10kerr); + printf("%7.2f %7.3f ", + $d10k,$d10kerr); + printf("\n"); + print<<EOF; +........................................................................ + Mmap Prot Page +Latency Error Fault Error Fault Error +-------- -------- ------- ------- -------- -------- +EOF + printf("%8.2f %8.3f ", + $stats{$host}{lat_mappings}{mean},$stats{$host}{lat_mappings}{stddev}); + printf("%7.2f %7.3f ", + $stats{$host}{lat_protfault}{mean},$stats{$host}{lat_protfault}{stddev}); + printf("%8.2f %8.3f ", + $stats{$host}{lat_pagefault}{mean},$stats{$host}{lat_pagefault}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Local* Communication bandwidths in MB/s - bigger is better +------------------------------------------------------------------------ + Pipe AF + Error UNIX Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_pipe}{mean},$stats{$host}{bw_pipe}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_unix}{mean},$stats{$host}{bw_unix}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + UDP TCP + Error Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + -1,-1); + printf("%6.2f %6.3f ", + $stats{$host}{bw_tcp_local}{mean},$stats{$host}{bw_tcp_local}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ 
+ File Mmap Bcopy Bcopy +reread Error reread Error (libc) Error (hand) Error +------ ------ ------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_reread}{mean},$stats{$host}{bw_reread}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_mmap}{mean},$stats{$host}{bw_mmap}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_bcopy_libc}{mean},$stats{$host}{bw_bcopy_libc}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_bcopy_unrolled}{mean},$stats{$host}{bw_bcopy_unrolled}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + Mem Mem + read Error write Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_mem_rdsum}{mean},$stats{$host}{bw_mem_rdsum}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_mem_wr}{mean},$stats{$host}{bw_mem_wr}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Net* Communication bandwidths in MB/s - bigger is better +------------------------------------------------------------------------ + UDP TCP + Error Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + -1,-1); + printf("%6.2f %6.3f ", + $stats{$host}{bw_tcp_net}{mean},$stats{$host}{bw_tcp_net}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +Memory latencies in nanoseconds - smaller is better + (WARNING - may not be correct, check graphs) +------------------------------------------------------------------------ + L1 L2 Main +Cache Error Cache Error mem Error Guesses +------ ------ ------ ------ ------ ------ ------- +EOF + $msg = &check_caches; + if ($stats{$host}{lat_l1}{mean} < 0) { + printf("%6s %6s ", + "------","------"); + printf("%6s %6s ", + "------","------"); + printf("%6s %6s ", + "------","------"); + printf("%6s","Bad mhz?"); + } else { + printf("%6.2f %6.3f ", + 
$stats{$host}{lat_l1}{mean},$stats{$host}{lat_l1}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_l2}{mean},$stats{$host}{lat_l2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_mem}{mean},$stats{$host}{lat_mem}{stddev}); + print $msg if ($msg =~ /L/); + } + printf("\n"); + + + +# This ends the host section... + print<<EOF; + +======================================================================== +EOF + +} + +exit 0; + + +# (33, %3d) +sub num +{ + local($val, $fmt) = @_; + local($str) = ""; + local($i); + + if ($val <= 0) { + $fmt =~ s/^.//; + while (length($fmt) > 1) { chop($fmt); } + for ($i = 0; $i < $fmt; $i++) { + $str .= " "; + } + return ($str); + } + $str = sprintf($fmt, $val); + $str; +} + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest value before the blank line. +sub getbiggest +{ + local($msg) = @_; + local($line) = 0; + + undef $save; + $value = 0; + while (<FD>) { + $line++; + #warn "$line $_"; + last if /^\s*$/; + $save = $_ if /^\d+\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 4) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} + +sub makestats +{ + + my $cnt=0; + my $host; + # Debugging + # print STDERR "Ready to make stats for array $array\n"; + # Zero the counters + $numhosts = @hosts; + 
for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + $stats{$host}{$array}{mean} = 0.0; + $stats{$host}{$array}{stddev} = 0.0; + $stats{$host}{$array}{count} = 0; + } + # Loop through ALL DATA. We use the hash to direct results to + # to the appropriate counters. + foreach $value (@$array){ + $host = $file[$cnt]; + if($$array[0] == -1){ + $stats{$host}{$array}{mean} = -1; + $stats{$host}{$array}{stddev} = -1; + # Debugging (and curiosity) + print STDERR "Oops. $array is empty.\n"; + return; + } + # Debugging + # print STDERR "$host/$array ($cnt): value is $value\n"; + $stats{$host}{$array}{mean} += $value; + $stats{$host}{$array}{stddev} += $value*$value; + $stats{$host}{$array}{count}++; + $cnt++; + } + for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + $cnt = $stats{$host}{$array}{count}; + # Debugging Only + # print STDERR "Evaluating final mean/stddev of $cnt objects in $host/$array\n"; + if($cnt>1) { + $stats{$host}{$array}{mean} = $stats{$host}{$array}{mean} / $cnt; + $stats{$host}{$array}{stddev} = sqrt(($stats{$host}{$array}{stddev} / $cnt + - $stats{$host}{$array}{mean}*$stats{$host}{$array}{mean})/($cnt-1)); + } elsif($cnt == 1) { + # Wish one could assign "infinity". This probably breaks somewhere. + $stats{$host}{$array}{stddev} = 1.0e+1000; + } else { + # print STDERR "Error: Cannot average 0 $array results on $host\n"; + } + + # Debugging Only. + # print STDERR "$host/$array (average): $stats{$host}{$array}{mean} +/- $stats{$host}{$array}{stddev}\n"; + } + +} diff --git a/performance/lmbench3/scripts/synchronize b/performance/lmbench3/scripts/synchronize new file mode 100755 index 0000000..302db00 --- /dev/null +++ b/performance/lmbench3/scripts/synchronize @@ -0,0 +1,60 @@ +#!/bin/sh + +# %W% %@% Copyright (c) 1998 Larry McVoy. +# +# Usage: SYNC_PID=3 SYNC_MAX=20 synchronize /tmp/sync_dir +# +# Used to sync up a bunch of processes so that they can operate in lockstep +# as much as possible. 
+# +# The first process to try and sync will mkdir(pathname) and create a named +# pipe in the directory. It also creates a file, pathname/$PID where pid +# is not the process id, it is the process number. The group of processes +# must be numbered from 1..N and they must each know their number. The Nth +# process is the master. Whereas all the other processes block, opening the +# pipe, the master spins in a loop, waiting for pathname/1 .. pathname/N-1 +# to show up in the directory. When they are all there, the master opens +# the pipe for writing and all the other processes get woken up and leave. +# +# It is outside of this program, but the directory must not exist before the +# synchronization. So you typically rm -rf it well before trying to sync. + +if [ X$1 = X ]; then echo "Usage: $0 pathname"; exit 1; fi +if [ X$SYNC_PID = X ]; then echo "Must set SYNC_PID"; exit 1; fi +if [ X$SYNC_MAX = X ]; then echo "Must set SYNC_MAX"; exit 1; fi + +DIR=$1 +mkdir -p $DIR 2>/dev/null +if [ ! -e $DIR/fifo ] +then mkfifo $DIR/fifo 2>/dev/null + chmod 666 $DIR/fifo 2>/dev/null +fi + +# slaves just read the pipe +if [ $SYNC_PID != $SYNC_MAX ] +then touch $DIR/$SYNC_PID + read x < $DIR/fifo + exit 0 +fi + +# Master waits for all the other processes to get there +PIDS="" +I=1 +while [ $I -lt $SYNC_MAX ] +do PIDS=" $I$PIDS" + I=`expr $I + 1` +done +while true +do GO=Y + for s in $PIDS + do if [ ! -e $DIR/$s ] + then GO=N + fi + done + if [ $GO = Y ] + then # This assumes that all the processes will + echo sleep 2 > $DIR/fifo & + exit 0 + fi + msleep 250 +done diff --git a/performance/lmbench3/scripts/target b/performance/lmbench3/scripts/target new file mode 100755 index 0000000..77eee07 --- /dev/null +++ b/performance/lmbench3/scripts/target @@ -0,0 +1,24 @@ +#!/bin/sh + +# Figure out the OS name if possible. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: target 1.3 00/01/31 15:29:43-08:00 lm@xxxxxxxxxxxxxxx $ +case `uname -s` in + *HP-UX*) echo hpux;; + *Linux*) echo linux;; + *IRIX*) echo irix;; + *AIX*) echo aix;; + BSD/OS) echo bsdi;; + *BSD*) echo bsd;; + *OSF1*) echo osf1;; + *ULTRIX*) echo ultrix;; + *SunOS*) case `uname -r` in + 4*) echo sunos;; + 5*) echo solaris;; + *) echo unknown;; + esac;; + *) echo unknown;; +esac +exit 0 diff --git a/performance/lmbench3/scripts/version b/performance/lmbench3/scripts/version new file mode 100755 index 0000000..879b700 --- /dev/null +++ b/performance/lmbench3/scripts/version @@ -0,0 +1,25 @@ +#!/bin/sh + +# %W% %@% + +F="no_such_file" +if [ -f version.h ] +then F=version.h +else if [ -f ../src/version.h ] + then F=../src/version.h + else if [ -f src/version.h ] + then F=src/version.h + fi + fi +fi +if [ -f $F ] +then VERS=`egrep 'MAJOR|MINOR' $F | awk '{print $3}'` + set `echo $VERS` + if [ $2 -lt 0 ] + then VERS=`echo $1$2 | sed s/-/alpha/` + else VERS=`echo $VERS |sed 's/ /./'` + fi + VERS=lmbench-$VERS +else VERS=lmench-2-something +fi +echo $VERS diff --git a/performance/lmbench3/scripts/xroff b/performance/lmbench3/scripts/xroff new file mode 100755 index 0000000..d5acf20 --- /dev/null +++ b/performance/lmbench3/scripts/xroff @@ -0,0 +1,5 @@ +#!/bin/sh + +# X previewer like groff/nroff scripts. 
+groff -P -filename -P "| groff -Z -X -Tps $*" -X -Tps "$@" +exit 0 diff --git a/performance/lmbench3/src/Makefile b/performance/lmbench3/src/Makefile new file mode 100644 index 0000000..089dcb9 --- /dev/null +++ b/performance/lmbench3/src/Makefile @@ -0,0 +1,506 @@ +# $Id$ + +# Make targets: +# +# lmbench [default] builds the benchmark suite for the current os/arch +# results builds, configures run parameters, and runs the benchmark +# rerun reruns the benchmark using the same parameters as last time +# scaling reruns the benchmark using same parameters as last time, +# except it asks what scaling value to use +# hardware reruns the hardware benchmarks using the same parameters +# os reruns the OS benchmarks using the same parameters +# clean cleans out sources and run configuration +# clobber clean and removes the bin directories +# shar obsolete, use cd .. && make shar +# depend builds make dependencies (needs gcc) +# debug builds all the benchmarks with '-g' debugging flag +# assembler builds the .s files for each benchmark +# +# This is largely self configuring. Most stuff is pretty portable. +# +# If you don't have gcc, try make CC=cc and see if that works. +# +# If you want to do cross-compilation try make OS=armv5tel-linux-gnu +# or whatever your OS string should be in the target environment. +# Since many embedded development environments also have a special +# cross-compiler, you might want to also select a particular compiler, +# so your build command would look something like: +# make OS=armv5tel-linux-gnu CC=gcc-arm +# +# Overriding the OS and CC make parameters needs to be done as an +# argument to make, not as an environment variable. See above comments. +# + +# I finally know why Larry Wall's Makefile says "Grrrr". 
+SHELL=/bin/sh + +CC=`../scripts/compiler` +MAKE=`../scripts/make` +AR=ar +ARCREATE=cr + +# base of installation location +BASE=/usr/local +O= ../bin/unknown +D= ../doc +TRUE=/bin/true +OS=`../scripts/os` +TARGET=`../scripts/target` +BINDIR=../bin/$(OS) +CONFIG=../bin/$(OS)/`../scripts/config` +UTILS=../scripts/target ../scripts/os ../scripts/gnu-os ../scripts/compiler \ + ../scripts/info ../scripts/info-template ../scripts/version \ + ../scripts/config ../scripts/config-run ../scripts/results \ + ../scripts/lmbench ../scripts/make ../scripts/build +INSTALL=cp +RESULTS=Results/$(OS) +SAMPLES=lmbench/Results/aix/rs6000 lmbench/Results/hpux/snake \ + lmbench/Results/irix/indigo2 lmbench/Results/linux/pentium \ + lmbench/Results/osf1/alpha lmbench/Results/solaris/ss20* + +COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) + +INCS = bench.h lib_mem.h lib_tcp.h lib_udp.h stats.h timing.h + +SRCS = bw_file_rd.c bw_mem.c bw_mmap_rd.c bw_pipe.c bw_tcp.c bw_udp.c \ + bw_unix.c \ + cache.c clock.c disk.c enough.c flushdisk.c getopt.c hello.c \ + lat_connect.c lat_ctx.c lat_fcntl.c lat_fifo.c lat_fs.c \ + lat_mem_rd.c lat_mmap.c lat_ops.c lat_pagefault.c lat_pipe.c \ + lat_proc.c lat_rpc.c lat_select.c lat_sig.c lat_syscall.c \ + lat_tcp.c lat_udp.c lat_unix.c lat_unix_connect.c lat_sem.c \ + lat_usleep.c lat_pmake.c \ + lib_debug.c lib_mem.c lib_stats.c lib_tcp.c lib_timing.c \ + lib_udp.c lib_unix.c lib_sched.c \ + line.c lmdd.c lmhttp.c par_mem.c par_ops.c loop_o.c memsize.c \ + mhz.c msleep.c rhttp.c seek.c timing_o.c tlb.c stream.c \ + bench.h lib_debug.h lib_tcp.h lib_udp.h lib_unix.h names.h \ + stats.h timing.h version.h + +ASMS = $O/bw_file_rd.s $O/bw_mem.s $O/bw_mmap_rd.s $O/bw_pipe.s \ + $O/bw_tcp.s $O/bw_udp.s $O/bw_unix.s $O/clock.s \ + $O/disk.s $O/enough.s $O/flushdisk.s $O/getopt.s $O/hello.s \ + $O/lat_connect.s $O/lat_ctx.s lat_fcntl.s $O/lat_fifo.s \ + $O/lat_fs.s $O/lat_mem_rd.s $O/lat_mmap.s $O/lat_ops.s \ + $O/lat_pagefault.s $O/lat_pipe.s $O/lat_proc.s 
$O/lat_rpc.s \ + $O/lat_select.s $O/lat_sig.s $O/lat_syscall.s $O/lat_tcp.s \ + $O/lat_udp.s $O/lat_unix.s $O/lat_unix_connect.s $O/lat_sem.s \ + $O/lib_debug.s $O/lib_mem.s \ + $O/lib_stats.s $O/lib_tcp.s $O/lib_timing.s $O/lib_udp.s \ + $O/lib_unix.s $O/lib_sched.s \ + $O/line.s $O/lmdd.s $O/lmhttp.s $O/par_mem.s \ + $O/par_ops.s $O/loop_o.s $O/memsize.s $O/mhz.s $O/msleep.s \ + $O/rhttp.s $O/timing_o.s $O/tlb.s $O/stream.s \ + $O/cache.s $O/lat_dram_page.s $O/lat_pmake.s $O/lat_rand.s \ + $O/lat_usleep.s $O/lat_cmd.s +EXES = $O/bw_file_rd $O/bw_mem $O/bw_mmap_rd $O/bw_pipe $O/bw_tcp \ + $O/bw_unix $O/hello \ + $O/lat_select $O/lat_pipe $O/lat_rpc $O/lat_syscall $O/lat_tcp \ + $O/lat_udp $O/lat_mmap $O/mhz $O/lat_proc $O/lat_pagefault \ + $O/lat_connect $O/lat_fs $O/lat_sig $O/lat_mem_rd $O/lat_ctx \ + $O/lat_sem \ + $O/memsize $O/lat_unix $O/lmdd $O/timing_o $O/enough \ + $O/msleep $O/loop_o $O/lat_fifo $O/lmhttp $O/lat_http \ + $O/lat_fcntl $O/disk $O/lat_unix_connect $O/flushdisk \ + $O/lat_ops $O/line $O/tlb $O/par_mem $O/par_ops \ + $O/stream +OPT_EXES=$O/cache $O/lat_dram_page $O/lat_pmake $O/lat_rand \ + $O/lat_usleep $O/lat_cmd +LIBOBJS= $O/lib_tcp.o $O/lib_udp.o $O/lib_unix.o $O/lib_timing.o \ + $O/lib_mem.o $O/lib_stats.o $O/lib_debug.o $O/getopt.o \ + $O/lib_sched.o + +lmbench: $(UTILS) + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build all + -@env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="-k $(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build opt + +results: lmbench + @env OS="${OS}" ../scripts/config-run + @env OS="${OS}" ../scripts/results + +rerun: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" ../scripts/results + +scaling: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; \ + else ../scripts/config-scaling $(CONFIG); fi + @env OS="${OS}" ../scripts/results + +hardware: lmbench + @if [ ! 
-f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" BENCHMARK_HARDWARE=YES BENCHMARK_OS=NO ../scripts/results + +os: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" BENCHMARK_HARDWARE=NO BENCHMARK_OS=YES ../scripts/results + +install: lmbench + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build install-target + +install-target: + if [ ! -d $(BASE) ]; then mkdir $(BASE); fi + if [ ! -d $(BASE)/bin ]; then mkdir $(BASE)/bin; fi + if [ ! -d $(BASE)/include ]; then mkdir $(BASE)/include; fi + if [ ! -d $(BASE)/lib ]; then mkdir $(BASE)/lib; fi + cp $(EXES) $(BASE)/bin + cp $(INCS) $(BASE)/include + cp $O/lmbench.a $(BASE)/lib/libmbench.a + cd ../doc; env MAKEFLAGS="$(MAKEFLAGS)" make CC="${CC}" OS="${OS}" BASE="$(BASE)" install + + +# No special handling for all these +all: $(EXES) $O/lmbench +opt: $(OPT_EXES) +asm: $(ASMS) +$(ASMS): + $(CC) -S $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ `basename $@ .s`.c + +Wall: + @env CFLAGS="-Wall -ansi" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt + +debug: + @env CFLAGS="-g -O" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt + +assembler: + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build asm + +bk.ver: ../SCCS/s.ChangeSet + rm -f bk.ver + -echo `bk prs -hr+ -d'$$if(:SYMBOL:){:SYMBOL: }:UTC:' ../ChangeSet;` > bk.ver + touch bk.ver + +dist: bk.ver + @if [ "X`cd ..; bk sfiles -c`" != "X" ]; then \ + echo "modified files!"; \ + false; \ + fi + @if [ "X`cd ..; bk pending`" != "X" ]; then \ + echo "pending changes!"; \ + false; \ + fi + cd ..; \ + SRCDIR=`pwd`; \ + DIR=`basename $${SRCDIR}`; \ + VERSION=`cat src/bk.ver| awk '{print $$1;}' | sed -e 's/Version-//g'`; \ + cd ..; \ + bk clone $${DIR} /tmp/lmbench-$${VERSION}; \ + cd /tmp/lmbench-$${VERSION}; \ + bk sfiles | xargs touch; \ + sleep 5; 
\ + bk get -s; \ + for d in doc results scripts src; do \ + cd $$d; bk get -s; cd ..; \ + done; \ + bk sfiles -U -g | xargs touch; \ + cd src; \ + make bk.ver; \ + cd /tmp; \ + tar czf $${SRCDIR}/../lmbench-$${VERSION}.tgz \ + lmbench-$${VERSION}; \ + rm -rf /tmp/lmbench-$${VERSION}; + +get $(SRCS): + -get -s $(SRCS) + +edit get-e: + get -e -s $(SRCS) + +clean: + /bin/rm -f ../bin/*/CONFIG ../bin/*/*.[oas] + /bin/rm -f *.[oas] + +clobber: + /bin/rm -rf ../bin* SHAR + +shar: + cd ../.. && shar lmbench/Results/Makefile $(SAMPLES) lmbench/scripts/* lmbench/src/Makefile lmbench/src/*.[ch] > lmbench/SHAR + +depend: ../scripts/depend + ../scripts/depend + +testmake: $(SRCS) $(UTILS) # used by scripts/make to test gmake + @true + +.PHONY: lmbench results rerun hardware os install all Wall debug \ + install install-target dist get edit get-e clean clobber \ + share depend testmake + +$O/lmbench : ../scripts/lmbench + rm -f $O/lmbench + sed -e "s/<version>/`cat bk.ver`/g" < ../scripts/lmbench > $O/lmbench + chmod +x $O/lmbench + +$O/lmbench.a: $(LIBOBJS) + /bin/rm -f $O/lmbench.a + $(AR) $(ARCREATE) $O/lmbench.a $(LIBOBJS) + -ranlib $O/lmbench.a + +$O/lib_timing.o : lib_timing.c $(INCS) + $(COMPILE) -c lib_timing.c -o $O/lib_timing.o +$O/lib_mem.o : lib_mem.c $(INCS) + $(COMPILE) -c lib_mem.c -o $O/lib_mem.o +$O/lib_tcp.o : lib_tcp.c $(INCS) + $(COMPILE) -c lib_tcp.c -o $O/lib_tcp.o +$O/lib_udp.o : lib_udp.c $(INCS) + $(COMPILE) -c lib_udp.c -o $O/lib_udp.o +$O/lib_unix.o : lib_unix.c $(INCS) + $(COMPILE) -c lib_unix.c -o $O/lib_unix.o +$O/lib_debug.o : lib_debug.c $(INCS) + $(COMPILE) -c lib_debug.c -o $O/lib_debug.o +$O/lib_stats.o : lib_stats.c $(INCS) + $(COMPILE) -c lib_stats.c -o $O/lib_stats.o +$O/lib_sched.o : lib_sched.c $(INCS) + $(COMPILE) -c lib_sched.c -o $O/lib_sched.o +$O/getopt.o : getopt.c $(INCS) + $(COMPILE) -c getopt.c -o $O/getopt.o + +$(UTILS) : + -cd ../scripts; make get + +# Do not remove the next line, $(MAKE) depend needs it +# MAKEDEPEND follows 
+$O/rhttp.s:rhttp.c timing.h stats.h bench.h +$O/rhttp: rhttp.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/rhttp rhttp.c $O/lmbench.a $(LDLIBS) + +$O/http.s:http.c timing.h stats.h bench.h +$O/http: http.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/http http.c $O/lmbench.a $(LDLIBS) + +$O/flushdisk.s:flushdisk.c +$O/flushdisk: flushdisk.c + $(COMPILE) -DMAIN -o $O/flushdisk flushdisk.c + +$O/mhz.s: mhz.c timing.h stats.h bench.h +$O/mhz: mhz.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/mhz mhz.c $O/lmbench.a $(LDLIBS) -lm + +$O/lat_ctx.s: lat_ctx.c timing.h stats.h bench.h +$O/lat_ctx: lat_ctx.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_ctx lat_ctx.c $O/lmbench.a $(LDLIBS) + +$O/lmhttp.s:lmhttp.c timing.h stats.h bench.h +$O/lmhttp: lmhttp.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lmhttp lmhttp.c $O/lmbench.a $(LDLIBS) + +$O/lat_http.s:lat_http.c timing.h stats.h bench.h +$O/lat_http: lat_http.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_http lat_http.c $O/lmbench.a $(LDLIBS) + +$O/bw_file_rd.s:bw_file_rd.c timing.h stats.h bench.h +$O/bw_file_rd: bw_file_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_file_rd bw_file_rd.c $O/lmbench.a $(LDLIBS) + +$O/bw_mem.s:bw_mem.c timing.h stats.h bench.h +$O/bw_mem: bw_mem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_mem bw_mem.c $O/lmbench.a $(LDLIBS) + +$O/bw_mmap_rd.s:bw_mmap_rd.c timing.h stats.h bench.h +$O/bw_mmap_rd: bw_mmap_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_mmap_rd bw_mmap_rd.c $O/lmbench.a $(LDLIBS) + +$O/bw_pipe.s:bw_pipe.c timing.h stats.h bench.h +$O/bw_pipe: bw_pipe.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_pipe bw_pipe.c $O/lmbench.a $(LDLIBS) + +$O/bw_tcp.s:bw_tcp.c bench.h timing.h stats.h lib_tcp.h +$O/bw_tcp: bw_tcp.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/bw_tcp bw_tcp.c $O/lmbench.a $(LDLIBS) + 
+$O/bw_udp.s:bw_udp.c bench.h timing.h stats.h lib_udp.h +$O/bw_udp: bw_udp.c bench.h timing.h stats.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/bw_udp bw_udp.c $O/lmbench.a $(LDLIBS) + +$O/bw_unix.s:bw_unix.c timing.h stats.h bench.h +$O/bw_unix: bw_unix.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_unix bw_unix.c $O/lmbench.a $(LDLIBS) + +$O/disk.s:disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h +$O/disk: disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/disk disk.c $O/lmbench.a $(LDLIBS) + +$O/clock.s:clock.c timing.h stats.h bench.h +$O/clock: clock.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/clock clock.c $O/lmbench.a $(LDLIBS) + +$O/hello.s:hello.c +$O/hello: hello.c $O/lmbench.a + $(COMPILE) -o $O/hello hello.c $O/lmbench.a $(LDLIBS) + +$O/lat_alarm.s:lat_alarm.c timing.h stats.h bench.h +$O/lat_alarm: lat_alarm.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_alarm lat_alarm.c $O/lmbench.a $(LDLIBS) + +$O/lat_connect.s:lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h +$O/lat_connect: lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a + $(COMPILE) -o $O/lat_connect lat_connect.c $O/lmbench.a $(LDLIBS) + +$O/lat_unix_connect.s:lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h +$O/lat_unix_connect: lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a + $(COMPILE) -o $O/lat_unix_connect lat_unix_connect.c $O/lmbench.a $(LDLIBS) + +$O/lat_fs.s:lat_fs.c timing.h stats.h bench.h +$O/lat_fs: lat_fs.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fs lat_fs.c $O/lmbench.a $(LDLIBS) + +$O/lat_fcntl.s:lat_fcntl.c timing.h stats.h bench.h +$O/lat_fcntl: lat_fcntl.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fcntl lat_fcntl.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_rd.s:lat_mem_rd.c timing.h stats.h bench.h +$O/lat_mem_rd: lat_mem_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o 
$O/lat_mem_rd lat_mem_rd.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_rd2.s:lat_mem_rd2.c timing.h stats.h bench.h +$O/lat_mem_rd2: lat_mem_rd2.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_rd2 lat_mem_rd2.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_wr.s:lat_mem_wr.c timing.h stats.h bench.h +$O/lat_mem_wr: lat_mem_wr.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_wr lat_mem_wr.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_wr2.s:lat_mem_wr2.c timing.h stats.h bench.h +$O/lat_mem_wr2: lat_mem_wr2.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_wr2 lat_mem_wr2.c $O/lmbench.a $(LDLIBS) + +$O/lat_mmap.s:lat_mmap.c timing.h stats.h bench.h +$O/lat_mmap: lat_mmap.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mmap lat_mmap.c $O/lmbench.a $(LDLIBS) + +$O/lat_mmaprd.s:lat_mmaprd.c timing.h stats.h bench.h +$O/lat_mmaprd: lat_mmaprd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mmaprd lat_mmaprd.c $O/lmbench.a $(LDLIBS) + +$O/lat_ops.s:lat_ops.c timing.h stats.h bench.h +$O/lat_ops: lat_ops.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_ops lat_ops.c $O/lmbench.a $(LDLIBS) + +$O/lat_pagefault.s:lat_pagefault.c timing.h stats.h bench.h +$O/lat_pagefault: lat_pagefault.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pagefault lat_pagefault.c $O/lmbench.a $(LDLIBS) + +$O/lat_pipe.s:lat_pipe.c timing.h stats.h bench.h +$O/lat_pipe: lat_pipe.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pipe lat_pipe.c $O/lmbench.a $(LDLIBS) + +$O/lat_fifo.s:lat_fifo.c timing.h stats.h bench.h +$O/lat_fifo: lat_fifo.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fifo lat_fifo.c $O/lmbench.a $(LDLIBS) + +$O/lat_proc.s:lat_proc.c timing.h stats.h bench.h +$O/lat_proc: lat_proc.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_proc lat_proc.c $O/lmbench.a $(LDLIBS) + +$O/lat_rpc.s:lat_rpc.c timing.h stats.h bench.h +$O/lat_rpc: lat_rpc.c timing.h 
stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_rpc lat_rpc.c $O/lmbench.a $(LDLIBS) + +$O/lat_sig.s:lat_sig.c timing.h stats.h bench.h +$O/lat_sig: lat_sig.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_sig lat_sig.c $O/lmbench.a $(LDLIBS) + +$O/lat_syscall.s:lat_syscall.c timing.h stats.h bench.h +$O/lat_syscall: lat_syscall.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_syscall lat_syscall.c $O/lmbench.a $(LDLIBS) + +$O/lat_select.s: lat_select.c timing.h stats.h bench.h +$O/lat_select: lat_select.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_select lat_select.c $O/lmbench.a $(LDLIBS) + +$O/lat_tcp.s:lat_tcp.c timing.h stats.h bench.h lib_tcp.h +$O/lat_tcp: lat_tcp.c timing.h stats.h bench.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/lat_tcp lat_tcp.c $O/lmbench.a $(LDLIBS) + +$O/lat_udp.s:lat_udp.c timing.h stats.h bench.h lib_udp.h +$O/lat_udp: lat_udp.c timing.h stats.h bench.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/lat_udp lat_udp.c $O/lmbench.a $(LDLIBS) + +$O/lat_unix.s:lat_unix.c timing.h stats.h bench.h +$O/lat_unix: lat_unix.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_unix lat_unix.c $O/lmbench.a $(LDLIBS) + +$O/lib_tcp.s:lib_tcp.c bench.h lib_tcp.h +$O/lib_tcp: lib_tcp.c bench.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/lib_tcp lib_tcp.c $O/lmbench.a $(LDLIBS) + +$O/lib_udp.s:lib_udp.c bench.h lib_udp.h +$O/lib_udp: lib_udp.c bench.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/lib_udp lib_udp.c $O/lmbench.a $(LDLIBS) + +$O/lmdd.s:lmdd.c timing.h stats.h bench.h +$O/lmdd: lmdd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lmdd lmdd.c $O/lmbench.a $(LDLIBS) + +$O/enough.s:enough.c timing.h stats.h bench.h +$O/enough: enough.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/enough enough.c $O/lmbench.a $(LDLIBS) + +$O/loop_o.s:loop_o.c timing.h stats.h bench.h +$O/loop_o: loop_o.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/loop_o loop_o.c 
$O/lmbench.a $(LDLIBS) + +$O/timing_o.s:timing_o.c timing.h stats.h bench.h +$O/timing_o: timing_o.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/timing_o timing_o.c $O/lmbench.a $(LDLIBS) + +$O/memsize.s:memsize.c timing.h stats.h bench.h +$O/memsize: memsize.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/memsize memsize.c $O/lmbench.a $(LDLIBS) + +$O/msleep.s:msleep.c timing.h stats.h bench.h +$O/msleep: msleep.c timing.h stats.h bench.h + $(COMPILE) -o $O/msleep msleep.c + +$O/line.s: line.c timing.h stats.h bench.h +$O/line: line.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/line line.c $O/lmbench.a $(LDLIBS) + +$O/tlb.s:tlb.c timing.h stats.h bench.h +$O/tlb: tlb.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/tlb tlb.c $O/lmbench.a $(LDLIBS) + +$O/cache.s:cache.c timing.h stats.h bench.h +$O/cache: cache.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/cache cache.c $O/lmbench.a $(LDLIBS) + +$O/par_mem.s:par_mem.c timing.h stats.h bench.h +$O/par_mem: par_mem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_mem par_mem.c $O/lmbench.a $(LDLIBS) + +$O/par_ops.s:par_ops.c timing.h stats.h bench.h +$O/par_ops: par_ops.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_ops par_ops.c $O/lmbench.a $(LDLIBS) + +$O/stream.s:stream.c timing.h stats.h bench.h +$O/stream: stream.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/stream stream.c $O/lmbench.a $(LDLIBS) + +$O/lat_sem.s:lat_sem.c timing.h stats.h bench.h +$O/lat_sem: lat_sem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_sem lat_sem.c $O/lmbench.a $(LDLIBS) + +$O/par_list.s:par_list.c timing.h stats.h bench.h +$O/par_list: par_list.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_list par_list.c $O/lmbench.a $(LDLIBS) + +$O/lat_dram_page.s:lat_dram_page.c timing.h stats.h bench.h +$O/lat_dram_page: lat_dram_page.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_dram_page 
lat_dram_page.c $O/lmbench.a $(LDLIBS) + +$O/lat_usleep.s:lat_usleep.c timing.h stats.h bench.h +$O/lat_usleep: lat_usleep.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_usleep lat_usleep.c $O/lmbench.a $(LDLIBS) + +$O/lat_pmake.s:lat_pmake.c timing.h stats.h bench.h +$O/lat_pmake: lat_pmake.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pmake lat_pmake.c $O/lmbench.a $(LDLIBS) + +$O/lat_rand.s:lat_rand.c timing.h stats.h bench.h +$O/lat_rand: lat_rand.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_rand lat_rand.c $O/lmbench.a $(LDLIBS) + +$O/lat_cmd.s:lat_cmd.c timing.h stats.h bench.h +$O/lat_cmd: lat_cmd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_cmd lat_cmd.c $O/lmbench.a $(LDLIBS) + diff --git a/performance/lmbench3/src/TODO b/performance/lmbench3/src/TODO new file mode 100644 index 0000000..47c3ff2 --- /dev/null +++ b/performance/lmbench3/src/TODO @@ -0,0 +1,107 @@ +$Id$ + +Add standard deviation and other statistics calculations to "make stats" +in results. Alternatively, we might report min, 1Q, median, 3Q, max, +as standard deviation for non-normal distributions isn't always sensible. + +Add flags to various file-related benchmarks bw_file_rd, bw_mmap_rd, +lat_fcntl.c, lat_fs, lat_mmap, and lat_pagefault, for parallelism +which selects whether each instance has its own file or shares a +file. + +Figure out how to improve lat_select. It doesn't really work for +multi-processor systems. Linus suggests that we have each process +do some amount of work, and vary the amount of work until context +switch times for the producer degrade. The current architecture +of lat_select is too synchronous and favors simple hand-off +scheduling too much. From Linus. + +Look into threads vs. process scaling. benchmp currently uses +separate processes (via fork()); some benchmarks such as page +faults and VM mapping might have very different performance +for threads vs. 
processes since Linux (at least) has per-memory +space locks for many of these things. From Linus. + +Add a '-f' option to lat_ctx which causes the work to be floating point +summation (so we get floating point state too). (Suggestion by Ingo Molnar) + +Add a threads benchmark suite (context switch, mutex, semaphore, ...). + +Create a new process for each measurement, rather than reusing the same +process. This is mostly to get different page layouts and mostly impacts +the memory latency benchmarks, although it can also affect lat_ctx. + +Write/extend the results processing system/scripts to graph/display/ +process results in the "-P <parallelism>" dimension, and to properly +handle results with differing parallelism when reporting standard +results. The parallelism is stored in the results file as SYNC_MAX. + +Add "bw_udp" benchmark to measure UDP bandwidth +[in progress] + +Make a bw_tcp mode that measures bandwidth for each block and graph that +as offset/bandwidth. + +Make the disk stuff autosize such that you get the same number of data +points regardless of disk size. + +Fix the getsummary to include simple calls. + +Think about the issues of int/long/long long/double/long double +load/stores. Maybe do them all. This will (at least) require +adding a test to scripts/build for the presence of long double +on this system. + +Make all results print out bandwidths in powers of 10/sizes in powers of two. + +Documentation on porting. + +Check that file size is right in the benchmarking system. + +Compiler version info included in results. XXX - do this! + +memory store latency (complex) + Why can't I just use the read one and make it write? + Well, because the read one is list oriented and I need to figure + out reasonable math for the write case. The read one is a load + per statement whereas the write one will be more work, I think. + +RPC numbers reserved for the benchmark. + +Check all the error outputs and make sure they are consistent. 
+ +On all the normalized graphs, make sure that they mean the same thing. +I do not think that the bandwidth measurements are "correct" in this +sense. + +Document the timing.c interfaces. + +Run the whole suite through gcc -Wall and fix all the errors. Also make +sure that it compiles and has the right sizes for 64 bit OS. + +[Mon Jul 1 13:30:01 PDT 1996, after meeting w/ Kevin] + +Do the load latency like so + + loop: + load r1 + { + increase the number of nops until they start to make the + run time longer - the last one was the memory latency. + } + use the register + { + increase the number of nops until they start to make the + run time longer - the last one was the cache fill shadow. + } + repeat + +Do the same thing w/ a varying number of loads (& their uses), showing +the number of outstanding loads implemented to L1, L2, mem. + +Do hand made assembler to get accurate numbers. Provide C source that +mimics the hand made assembler for new machines. + +Think about a report format for the hardware stuff that showed the +numbers as triples L1/L2/mem (or quadruples for alphas). 
+ diff --git a/performance/lmbench3/src/bench.h b/performance/lmbench3/src/bench.h new file mode 100644 index 0000000..8166408 --- /dev/null +++ b/performance/lmbench3/src/bench.h @@ -0,0 +1,323 @@ +/* + * $Id$ + */ +#ifndef _BENCH_H +#define _BENCH_H + +#ifdef WIN32 +#include <windows.h> +typedef unsigned char bool_t; +#endif + +#include <assert.h> +#include <ctype.h> +#include <stdio.h> +#ifndef WIN32 +#include <unistd.h> +#endif +#include <stdlib.h> +#include <fcntl.h> +#include <signal.h> +#include <errno.h> +#ifndef WIN32 +#include <strings.h> +#endif +#include <sys/types.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifndef WIN32 +#include <sys/wait.h> +#include <time.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/resource.h> +#define PORTMAP +#include <rpc/rpc.h> +#endif +#include <rpc/types.h> + +#include <stdarg.h> +#ifndef HAVE_uint +typedef unsigned int uint; +#endif + +#ifndef HAVE_uint64 +#ifdef HAVE_uint64_t +typedef uint64_t uint64; +#else /* HAVE_uint64_t */ +typedef unsigned long long uint64; +#endif /* HAVE_uint64_t */ +#endif /* HAVE_uint64 */ + +#ifndef HAVE_int64 +#ifdef HAVE_int64_t +typedef int64_t int64; +#else /* HAVE_int64_t */ +typedef long long int64; +#endif /* HAVE_int64_t */ +#endif /* HAVE_int64 */ + +#define NO_PORTMAPPER + +#include "stats.h" +#include "timing.h" +#include "lib_debug.h" +#include "lib_tcp.h" +#include "lib_udp.h" +#include "lib_unix.h" + + +#ifdef DEBUG +# define debug(x) fprintf x +#else +# define debug(x) +#endif +#ifdef NO_PORTMAPPER +#define TCP_SELECT -31233 +#define TCP_XACT -31234 +#define TCP_CONTROL -31235 +#define TCP_DATA -31236 +#define TCP_CONNECT -31237 +#define UDP_XACT -31238 +#define UDP_DATA -31239 +#else +#define TCP_SELECT (u_long)404038 /* XXX - unregistered */ +#define TCP_XACT (u_long)404039 /* XXX - unregistered */ +#define TCP_CONTROL (u_long)404040 /* XXX - unregistered */ +#define TCP_DATA (u_long)404041 /* XXX - 
unregistered */ +#define TCP_CONNECT (u_long)404042 /* XXX - unregistered */ +#define UDP_XACT (u_long)404032 /* XXX - unregistered */ +#define UDP_DATA (u_long)404033 /* XXX - unregistered */ +#define VERS (u_long)1 +#endif + +#define UNIX_CONTROL "/tmp/lmbench.ctl" +#define UNIX_DATA "/tmp/lmbench.data" +#define UNIX_LAT "/tmp/lmbench.lat" + +/* + * socket send/recv buffer optimizations + */ +#define SOCKOPT_READ 0x0001 +#define SOCKOPT_WRITE 0x0002 +#define SOCKOPT_RDWR 0x0003 +#define SOCKOPT_PID 0x0004 +#define SOCKOPT_REUSE 0x0008 +#define SOCKOPT_NONE 0 + +#ifndef SOCKBUF +#define SOCKBUF (1024*1024) +#endif + +#ifndef XFERSIZE +#define XFERSIZE (64*1024) /* all bandwidth I/O should use this */ +#endif + +#if defined(SYS5) || defined(WIN32) +#define bzero(b, len) memset(b, 0, len) +#define bcopy(s, d, l) memcpy(d, s, l) +#define rindex(s, c) strrchr(s, c) +#endif +#define gettime usecs_spent +#define streq !strcmp +#define ulong unsigned long + +#ifndef HAVE_DRAND48 +#ifdef HAVE_RAND +#define srand48 srand +#define drand48() ((double)rand() / (double)RAND_MAX) +#elif defined(HAVE_RANDOM) +#define srand48 srandom +#define drand48() ((double)random() / (double)RAND_MAX) +#endif /* HAVE_RAND */ +#endif /* HAVE_DRAND48 */ + +#ifdef WIN32 +#include <process.h> +#define getpid _getpid +int gettimeofday(struct timeval *tv, struct timezone *tz); +#endif + +#define SMALLEST_LINE 32 /* smallest cache line size */ +#define TIME_OPEN2CLOSE + +#define GO_AWAY signal(SIGALRM, exit); alarm(60 * 60); +#define REAL_SHORT 50000 +#define SHORT 1000000 +#define MEDIUM 2000000 +#define LONGER 7500000 /* for networking data transfers */ +#define ENOUGH REAL_SHORT + +#define TRIES 11 + +typedef struct { + uint64 u; + uint64 n; +} value_t; + +typedef struct { + int N; + value_t v[TRIES]; +} result_t; +int sizeof_result(int N); +void insertinit(result_t *r); +void insertsort(uint64, uint64, result_t *); +void save_median(); +void save_minimum(); +void set_results(result_t *r); 
+result_t* get_results(); + + +#define BENCHO(loop_body, overhead_body, enough) { \ + int __i, __N; \ + double __oh; \ + result_t __overhead, __r; \ + insertinit(&__overhead); insertinit(&__r); \ + __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ + if (enough < LONGER) {loop_body;} /* warm the cache */ \ + for (__i = 0; __i < __N; ++__i) { \ + BENCH1(overhead_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__overhead); \ + BENCH1(loop_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__r); \ + } \ + for (__i = 0; __i < __r.N; ++__i) { \ + __oh = __overhead.v[__i].u / (double)__overhead.v[__i].n; \ + if (__r.v[__i].u > (uint64)((double)__r.v[__i].n * __oh)) \ + __r.v[__i].u -= (uint64)((double)__r.v[__i].n * __oh); \ + else \ + __r.v[__i].u = 0; \ + } \ + *(get_results()) = __r; \ +} + +#define BENCH(loop_body, enough) { \ + long __i, __N; \ + result_t __r; \ + insertinit(&__r); \ + __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ + if (enough < LONGER) {loop_body;} /* warm the cache */ \ + for (__i = 0; __i < __N; ++__i) { \ + BENCH1(loop_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__r); \ + } \ + *(get_results()) = __r; \ +} + +#define BENCH1(loop_body, enough) { \ + double __usecs; \ + BENCH_INNER(loop_body, enough); \ + __usecs = gettime(); \ + __usecs -= t_overhead() + get_n() * l_overhead(); \ + settime(__usecs >= 0. ? (uint64)__usecs : 0); \ +} + +#define BENCH_INNER(loop_body, enough) { \ + static iter_t __iterations = 1; \ + int __enough = get_enough(enough); \ + iter_t __n; \ + double __result = 0.; \ + \ + while(__result < 0.95 * __enough) { \ + start(0); \ + for (__n = __iterations; __n > 0; __n--) { \ + loop_body; \ + } \ + __result = stop(0,0); \ + if (__result < 0.99 * __enough \ + || __result > 1.2 * __enough) { \ + if (__result > 150.) 
{ \ + double tmp = __iterations / __result; \ + tmp *= 1.1 * __enough; \ + __iterations = (iter_t)(tmp + 1); \ + } else { \ + if (__iterations > (iter_t)1<<27) { \ + __result = 0.; \ + break; \ + } \ + __iterations <<= 3; \ + } \ + } \ + } /* while */ \ + save_n((uint64)__iterations); settime((uint64)__result); \ +} + +/* getopt stuff */ +#define getopt mygetopt +#define optind myoptind +#define optarg myoptarg +#define opterr myopterr +#define optopt myoptopt +extern int optind; +extern int opterr; +extern int optopt; +extern char *optarg; +int getopt(int ac, char **av, char *opts); + +typedef u_long iter_t; +typedef void (*benchmp_f)(iter_t iterations, void* cookie); + +extern void benchmp(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie + ); + +/* + * These are used by weird benchmarks which cannot return, such as page + * protection fault handling. See lat_sig.c for sample usage. + */ +extern void* benchmp_getstate(); +extern iter_t benchmp_interval(void* _state); + +/* + * Which child process is this? + * Returns a number in the range [0, ..., N-1], where N is the + * total number of children (parallelism) + */ +extern int benchmp_childid(); + +/* + * harvest dead children to prevent zombies + */ +extern void sigchld_wait_handler(int signo); + +/* + * Handle optional pinning/placement of processes on an SMP machine. + */ +extern int handle_scheduler(int childno, int benchproc, int nbenchprocs); + +#include "lib_mem.h" + +/* + * Generated from msg.x which is included here: + + program XACT_PROG { + version XACT_VERS { + char + RPC_XACT(char) = 1; + } = 1; + } = 3970; + + * Please do not edit this file. + * It was generated using rpcgen. 
+ */ + +#define XACT_PROG ((u_long)404040) +#define XACT_VERS ((u_long)1) +#define RPC_XACT ((u_long)1) +#define RPC_EXIT ((u_long)2) +extern char *rpc_xact_1(); +extern char *client_rpc_xact_1(); + +#endif /* _BENCH_H */ diff --git a/performance/lmbench3/src/bk.ver b/performance/lmbench3/src/bk.ver new file mode 100644 index 0000000..00750ed --- /dev/null +++ b/performance/lmbench3/src/bk.ver @@ -0,0 +1 @@ +3 diff --git a/performance/lmbench3/src/busy.c b/performance/lmbench3/src/busy.c new file mode 100644 index 0000000..ab117ba --- /dev/null +++ b/performance/lmbench3/src/busy.c @@ -0,0 +1,10 @@ +volatile int i; + +main() +{ + + nice(10); + for (;;) getppid(); + //for (;;) i++; + exit(i); +} diff --git a/performance/lmbench3/src/bw_file_rd.c b/performance/lmbench3/src/bw_file_rd.c new file mode 100644 index 0000000..61583c6 --- /dev/null +++ b/performance/lmbench3/src/bw_file_rd.c @@ -0,0 +1,192 @@ +/* + * bw_file_rd.c - time reading & summing of a file + * + * Usage: bw_file_rd [-C] [-P <parallelism] [-W <warmup>] [-N <repetitions>] size file + * + * The intent is that the file is in memory. + * Disk benchmarking is done with lmdd. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((int)(x) == -1) { perror(#x); exit(1); } +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +#define TYPE int +#define MINSZ (sizeof(TYPE) * 128) + +void *buf; /* do the I/O here */ +size_t xfersize; /* do it in units of this */ +size_t count; /* bytes to move (can't be modified) */ + +typedef struct _state { + char filename[256]; + int fd; + int clone; +} state_t; + +void doit(int fd) +{ + int sum = 0; + size_t size, chunk; + + size = count; + chunk = xfersize; + while (size >= 0) { + if (size < chunk) chunk = size; + if (read(fd, buf, MIN(size, chunk)) <= 0) { + break; + } + bread(buf, MIN(size, xfersize)); + size -= chunk; + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->fd = -1; + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); + sprintf(s, "%s%d", state->filename, (int)getpid()); + if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { + perror("creating private tempfile"); + unlink(s); + exit(1); + } + strcpy(state->filename, s); + } +} + +void +init_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + int ofd; + + if (iterations) return; + + initialize(0, cookie); + CHK(ofd = open(state->filename, O_RDONLY)); + state->fd = ofd; +} + +void +time_with_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + char *filename = state->filename; + int fd; + + while (iterations-- > 0) { + fd= open(filename, O_RDONLY); + doit(fd); + close(fd); + } +} + +void +time_io_only(iter_t iterations,void * cookie) +{ + state_t *state = (state_t *) cookie; + int fd = state->fd; + + while (iterations-- > 0) { + lseek(fd, 0, 0); + doit(fd); + } +} + +void +cleanup(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->fd >= 0) close(state->fd); + if (state->clone) unlink(state->filename); 
+} + +int +main(int ac, char **av) +{ + int fd; + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char usage[1024]; + + sprintf(usage,"[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|io_only <filename>" + "\nmin size=%d\n",(int) (XFERSIZE>>10)) ; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 3 != ac) { /* should have three arguments left */ + lmbench_usage(ac, av, usage); + } + + strcpy(state.filename,av[optind+2]); + count = bytes(av[optind]); + if (count < MINSZ) { + exit(1); /* I want this to be quiet */ + } + if (count < XFERSIZE) { + xfersize = count; + } else { + xfersize = XFERSIZE; + } + buf = (void *)valloc(XFERSIZE); + bzero(buf, XFERSIZE); + + if (!strcmp("open2close", av[optind+1])) { + benchmp(initialize, time_with_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else if (!strcmp("io_only", av[optind+1])) { + benchmp(init_open, time_io_only, cleanup, + 0, parallel, warmup, repetitions, &state); + } else lmbench_usage(ac, av, usage); + bandwidth(count, get_n() * parallel, 0); + return (0); +} diff --git a/performance/lmbench3/src/bw_mem.c b/performance/lmbench3/src/bw_mem.c new file mode 100644 index 0000000..19583cf --- /dev/null +++ b/performance/lmbench3/src/bw_mem.c @@ -0,0 +1,468 @@ +/* + * bw_mem.c - simple memory write bandwidth benchmark + * + * Usage: bw_mem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size what + * what: rd wr rdwr cp fwr frd fcp bzero bcopy + * + * Copyright (c) 1994-1996 Larry McVoy. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$"; + +#include "bench.h" + +#define TYPE int + +/* + * rd - 4 byte read, 32 byte stride + * wr - 4 byte write, 32 byte stride + * rdwr - 4 byte read followed by 4 byte write to same place, 32 byte stride + * cp - 4 byte read then 4 byte write to different place, 32 byte stride + * fwr - write every 4 byte word + * frd - read every 4 byte word + * fcp - copy every 4 byte word + * + * All tests do 512 byte chunks in a loop. + * + * XXX - do a 64bit version of this. + */ +void rd(iter_t iterations, void *cookie); +void wr(iter_t iterations, void *cookie); +void rdwr(iter_t iterations, void *cookie); +void mcp(iter_t iterations, void *cookie); +void fwr(iter_t iterations, void *cookie); +void frd(iter_t iterations, void *cookie); +void fcp(iter_t iterations, void *cookie); +void loop_bzero(iter_t iterations, void *cookie); +void loop_bcopy(iter_t iterations, void *cookie); +void init_overhead(iter_t iterations, void *cookie); +void init_loop(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +typedef struct _state { + double overhead; + size_t nbytes; + int need_buf2; + int aligned; + TYPE *buf; + TYPE *buf2; + TYPE *buf2_orig; + TYPE *lastone; + size_t N; +} state_t; + +void adjusted_bandwidth(uint64 t, uint64 b, uint64 iter, double ovrhd); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t nbytes; + state_t state; + int c; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> what [conflict]\nwhat: rd wr rdwr cp fwr frd fcp bzero bcopy\n<size> must be larger than 512"; + + state.overhead = 0; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + 
switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* should have two, possibly three [indicates align] arguments left */ + state.aligned = state.need_buf2 = 0; + if (optind + 3 == ac) { + state.aligned = 1; + } else if (optind + 2 != ac) { + lmbench_usage(ac, av, usage); + } + + nbytes = state.nbytes = bytes(av[optind]); + if (state.nbytes < 512) { /* this is the number of bytes in the loop */ + lmbench_usage(ac, av, usage); + } + + if (streq(av[optind+1], "cp") || + streq(av[optind+1], "fcp") || streq(av[optind+1], "bcopy")) { + state.need_buf2 = 1; + } + + if (streq(av[optind+1], "rd")) { + benchmp(init_loop, rd, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "wr")) { + benchmp(init_loop, wr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "rdwr")) { + benchmp(init_loop, rdwr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "cp")) { + benchmp(init_loop, mcp, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "frd")) { + benchmp(init_loop, frd, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "fwr")) { + benchmp(init_loop, fwr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "fcp")) { + benchmp(init_loop, fcp, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "bzero")) { + benchmp(init_loop, loop_bzero, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "bcopy")) { + benchmp(init_loop, loop_bcopy, cleanup, 0, parallel, + warmup, repetitions, &state); + } else { + lmbench_usage(ac, av, usage); + } + adjusted_bandwidth(gettime(), nbytes, + get_n() * 
parallel, state.overhead); + return(0); +} + +void +init_overhead(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; +} + +void +init_loop(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->buf = (TYPE *)valloc(state->nbytes); + state->buf2_orig = NULL; + state->lastone = (TYPE*)state->buf - 1; + state->lastone = (TYPE*)((char *)state->buf + state->nbytes - 512); + state->N = state->nbytes; + + if (!state->buf) { + perror("malloc"); + exit(1); + } + bzero((void*)state->buf, state->nbytes); + + if (state->need_buf2 == 1) { + state->buf2_orig = state->buf2 = (TYPE *)valloc(state->nbytes + 2048); + if (!state->buf2) { + perror("malloc"); + exit(1); + } + + /* default is to have stuff unaligned wrt each other */ + /* XXX - this is not well tested or thought out */ + if (state->aligned) { + char *tmp = (char *)state->buf2; + + tmp += 2048 - 128; + state->buf2 = (TYPE *)tmp; + } + } +} + +void +cleanup(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + free(state->buf); + if (state->buf2_orig) free(state->buf2_orig); +} + +void +rd(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + register int sum = 0; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { + sum += +#define DOIT(i) p[i]+ + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) + p[124]; + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +wr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + 
while (p <= lastone) { +#define DOIT(i) p[i] = 1; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + } + } +} +#undef DOIT + +void +rdwr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + register int sum = 0; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { +#define DOIT(i) sum += p[i]; p[i] = 1; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +mcp(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + TYPE* p_save = NULL; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + while (p <= lastone) { +#define DOIT(i) dst[i] = p[i]; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + dst += 128; + } + p_save = p; + } + use_pointer(p_save); +} +#undef DOIT + +void +fwr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + TYPE* p_save = NULL; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { +#define DOIT(i) p[i]= + 
DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) 1; + p += 128; + } + p_save = p; + } + use_pointer(p_save); +} +#undef DOIT + +void +frd(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register int sum = 0; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { + sum += +#define DOIT(i) p[i]+ + DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) 
DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) p[127]; + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +fcp(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + while (p <= lastone) { +#define DOIT(i) dst[i]=p[i]; + DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) 
DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) + p += 128; + dst += 128; + } + } +} + +void +loop_bzero(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + register size_t N = state->N; + + while (iterations-- > 0) { + bzero(p, N); + } +} + +void +loop_bcopy(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + register size_t N = state->N; + + while (iterations-- > 0) { + bcopy(p,dst,N); + } +} + +/* + * Almost like bandwidth() in lib_timing.c, but we need to adjust + * bandwidth based upon loop overhead. + */ +void adjusted_bandwidth(uint64 time, uint64 bytes, uint64 iter, double overhd) +{ +#define MB (1000. * 1000.) + extern FILE *ftiming; + double secs = ((double)time / (double)iter - overhd) / 1000000.0; + double mb; + + mb = bytes / MB; + + if (secs <= 0.) + return; + + if (!ftiming) ftiming = stderr; + if (mb < 1.) { + (void) fprintf(ftiming, "%.6f ", mb); + } else { + (void) fprintf(ftiming, "%.2f ", mb); + } + if (mb / secs < 1.) { + (void) fprintf(ftiming, "%.6f\n", mb/secs); + } else { + (void) fprintf(ftiming, "%.2f\n", mb/secs); + } +} + + diff --git a/performance/lmbench3/src/bw_mmap_rd.c b/performance/lmbench3/src/bw_mmap_rd.c new file mode 100644 index 0000000..03c27b1 --- /dev/null +++ b/performance/lmbench3/src/bw_mmap_rd.c @@ -0,0 +1,185 @@ +/* + * bw_mmap_rd.c - time reading & summing of a file using mmap + * + * Usage: bw_mmap_rd [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file + * + * Sizes less than 2m are not recommended. 
Memory is read by summing it up + * so the numbers include the cost of the adds. If you use sizes large + * enough, you can compare to bw_mem_rd and get the cost of TLB fills + * (very roughly). + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" +#ifdef MAP_FILE +# define MMAP_FLAGS MAP_FILE|MAP_SHARED +#else +# define MMAP_FLAGS MAP_SHARED +#endif + +#define TYPE int +#define MINSZ (sizeof(TYPE) * 128) +#define CHK(x) if ((long)(x) == -1) { perror("x"); exit(1); } + +typedef struct _state { + size_t nbytes; + char filename[256]; + int fd; + int clone; + void *buf; +} state_t; + +void time_no_open(iter_t iterations, void * cookie); +void time_with_open(iter_t iterations, void * cookie); +void initialize(iter_t iterations, void *cookie); +void init_open(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +int +main(int ac, char **av) +{ + int fd; + struct stat sbuf; + void *buf; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t nbytes; + state_t state; + int c; + char *usage = "[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|mmap_only <filename>"; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* should have three arguments left (bytes type filename) */ + if (optind + 3 != ac) { + lmbench_usage(ac, av, usage); + } + + nbytes = 
state.nbytes = bytes(av[optind]); + strcpy(state.filename,av[optind+2]); + CHK(stat(state.filename, &sbuf)); + if ((S_ISREG(sbuf.st_mode) && nbytes > sbuf.st_size) + || (nbytes < MINSZ)) { + fprintf(stderr,"<size> out of range!\n"); + exit(1); + } + + if (!strcmp("open2close", av[optind+1])) { + benchmp(initialize, time_with_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else if (!strcmp("mmap_only", av[optind+1])) { + benchmp(init_open, time_no_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else { + lmbench_usage(ac, av, usage); + } + bandwidth(nbytes, get_n() * parallel, 0); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->fd = -1; + state->buf = NULL; + + if (state->clone) { + char buf[8192]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); + sprintf(s, "%s%d", state->filename, (int)getpid()); + if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { + perror("creating private tempfile"); + unlink(s); + exit(1); + } + strcpy(state->filename, s); + } +} + +void +init_open(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + initialize(0, cookie); + CHK(state->fd = open(state->filename, 0)); + CHK(state->buf = mmap(0, state->nbytes, PROT_READ, + MMAP_FLAGS, state->fd, 0)); +} + +void +cleanup(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + if (state->buf) munmap(state->buf, state->nbytes); + if (state->fd >= 0) close(state->fd); + if (state->clone) unlink(state->filename); +} + +void +time_no_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + bread(state->buf, state->nbytes); + } +} + +void +time_with_open(iter_t iterations, void *cookie) +{ + 
state_t *state = (state_t *) cookie; + char *filename = state->filename; + size_t nbytes = state->nbytes; + int fd; + void *p; + + while (iterations-- > 0) { + CHK(fd = open(filename, 0)); + CHK(p = mmap(0, nbytes, PROT_READ, MMAP_FLAGS, fd, 0)); + bread(p, nbytes); + close(fd); + munmap(p, nbytes); + } +} diff --git a/performance/lmbench3/src/bw_pipe.c b/performance/lmbench3/src/bw_pipe.c new file mode 100644 index 0000000..5d9edfb --- /dev/null +++ b/performance/lmbench3/src/bw_pipe.c @@ -0,0 +1,187 @@ +/* + * bw_pipe.c - pipe bandwidth benchmark. + * + * Usage: bw_pipe [-m <message size>] [-M <total bytes>] \ + * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void reader(iter_t iterations, void* cookie); +void writer(int writefd, char* buf, size_t xfer); + +int XFER = 10*1024*1024; + +struct _state { + int pid; + size_t xfer; /* bytes to read/write per "packet" */ + size_t bytes; /* bytes to read/write in one iteration */ + char *buf; /* buffer memory space */ + int readfd; + int initerr; +}; + +void +initialize(iter_t iterations, void *cookie) +{ + int pipes[2]; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->initerr = 0; + if (pipe(pipes) == -1) { + perror("pipe"); + state->initerr = 1; + return; + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + close(pipes[0]); + handle_scheduler(benchmp_childid(), 1, 1); + state->buf = valloc(state->xfer); + if (state->buf == NULL) { + perror("child: no memory"); + state->initerr = 4; + return; + } + touch(state->buf, state->xfer); + writer(pipes[1], state->buf, state->xfer); + return; + /*NOTREACHED*/ + + case -1: + perror("fork"); + state->initerr = 3; + return; + /*NOTREACHED*/ + + default: + break; + } + close(pipes[1]); + state->readfd = pipes[0]; + state->buf = valloc(state->xfer + getpagesize()); + if (state->buf == NULL) { + perror("parent: no memory"); + state->initerr = 4; + return; + } + touch(state->buf, state->xfer + getpagesize()); + state->buf += 128; /* destroy page alignment */ +} + +void +cleanup(iter_t iterations, void * cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + close(state->readfd); + if (state->pid > 0) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +void +reader(iter_t iterations, void * cookie) +{ + size_t done; + ssize_t n; + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + for (done = 0; done < state->bytes; done += n) { + if ((n = read(state->readfd, state->buf, state->xfer)) < 0) { + perror("bw_pipe: 
reader: error in read"); + exit(1); + } + } + } +} + +void +writer(int writefd, char* buf, size_t xfer) +{ + size_t done; + ssize_t n; + + for ( ;; ) { +#ifdef TOUCH + touch(buf, xfer); +#endif + for (done = 0; done < xfer; done += n) { + if ((n = write(writefd, buf, xfer - done)) < 0) { + exit(0); + } + } + } +} + +int +main(int ac, char *av[]) +{ + struct _state state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.xfer = XFERSIZE; /* per-packet size */ + state.bytes = XFER; /* total bytes per call */ + + while (( c = getopt(ac, av, "m:M:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.xfer = bytes(optarg); + break; + case 'M': + state.bytes = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + /* round up total byte count to a multiple of xfer */ + if (state.bytes < state.xfer) { + state.bytes = state.xfer; + } else if (state.bytes % state.xfer) { + state.bytes += state.bytes - state.bytes % state.xfer; + } + benchmp(initialize, reader, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + fprintf(stderr, "Pipe bandwidth: "); + mb(get_n() * parallel * state.bytes); + } + return(0); +} diff --git a/performance/lmbench3/src/bw_tcp.c b/performance/lmbench3/src/bw_tcp.c new file mode 100644 index 0000000..6a2e8f7 --- /dev/null +++ b/performance/lmbench3/src/bw_tcp.c @@ -0,0 +1,251 @@ +/* + * bw_tcp.c - simple TCP bandwidth test + * + * Three programs in one - + * server usage: bw_tcp -s + * client usage: bw_tcp [-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] 
hostname + * shutdown: bw_tcp -hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +typedef struct _state { + int sock; + uint64 move; + int msize; + char *server; + int fd; + char *buf; +} state_t; + +void server_main(); +void client_main(int parallel, state_t *state); +void source(int data); + +void initialize(iter_t iterations, void* cookie); +void loop_transfer(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = LONGER; + int repetitions = TRIES; + int shutdown = 0; + state_t state; + char *usage = "-s\n OR [-m <message size>] [-M <bytes to move>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S serverhost\n"; + int c; + + state.msize = 0; + state.move = 0; + + /* Rest is client argument processing */ + while (( c = getopt(ac, av, "sS:m:M:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + break; + case 'S': /* shutdown serverhost */ + { + int conn; + conn = tcp_connect(optarg, TCP_DATA, SOCKOPT_NONE); + write(conn, "0", 1); + exit(0); + } + case 'm': + state.msize = bytes(optarg); + break; + case 'M': + state.move = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind < ac - 2 || optind >= ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + + if (state.msize 
== 0 && state.move == 0) { + state.msize = state.move = XFERSIZE; + } else if (state.msize == 0) { + state.msize = state.move; + } else if (state.move == 0) { + state.move = state.msize; + } + + /* make the number of bytes to move a multiple of the message size */ + if (state.move % state.msize) { + state.move += state.msize - state.move % state.msize; + } + + /* + * Default is to warmup the connection for seven seconds, + * then measure performance over each timing interval. + * This minimizes the effect of opening and initializing TCP + * connections. + */ + benchmp(initialize, loop_transfer, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + fprintf(stderr, "%.6f ", state.msize / (1000. * 1000.)); + mb(state.move * get_n() * parallel); + } +} + +void +initialize(iter_t iterations, void *cookie) +{ + int c; + char buf[100]; + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->buf = valloc(state->msize); + if (!state->buf) { + perror("valloc"); + exit(1); + } + touch(state->buf, state->msize); + + state->sock = tcp_connect(state->server, TCP_DATA, SOCKOPT_READ|SOCKOPT_WRITE|SOCKOPT_REUSE); + if (state->sock < 0) { + perror("socket connection"); + exit(1); + } + sprintf(buf, "%lu", state->msize); + if (write(state->sock, buf, strlen(buf) + 1) != strlen(buf) + 1) { + perror("control write"); + exit(1); + } +} + +void +loop_transfer(iter_t iterations, void *cookie) +{ + int c; + uint64 todo; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + for (todo = state->move; todo > 0; todo -= c) { + if ((c = read(state->sock, state->buf, state->msize)) <= 0) { + exit(1); + } + if (c > todo) c = todo; + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + /* close connection */ + (void)close(state->sock); +} + +void +server_main() +{ + int data, newdata; + + GO_AWAY; + + data = tcp_server(TCP_DATA, 
SOCKOPT_WRITE|SOCKOPT_REUSE); + if (data < 0) { + perror("server socket creation"); + exit(1); + } + + signal(SIGCHLD, sigchld_wait_handler); + for ( ;; ) { + newdata = tcp_accept(data, SOCKOPT_WRITE); + switch (fork()) { + case -1: + perror("fork"); + break; + case 0: + source(newdata); + exit(0); + default: + close(newdata); + break; + } + } +} + +/* + * Read the message size. Keep transferring + * data in message-size sized packets until + * the socket goes away. + */ +void +source(int data) +{ + size_t count, m; + unsigned long nbytes; + char *buf, scratch[100]; + + /* + * read the message size + */ + bzero(scratch, 100); + if (read(data, scratch, 100) <= 0) { + perror("control nbytes"); + exit(7); + } + sscanf(scratch, "%lu", &nbytes); + m = nbytes; + + /* + * A hack to allow turning off the absorb daemon. + */ + if (m == 0) { + tcp_done(TCP_DATA); + kill(getppid(), SIGTERM); + exit(0); + } + + buf = valloc(m); + bzero(buf, m); + + /* + * Keep sending messages until the connection is closed + */ + while (write(data, buf, m) == m) { +#ifdef TOUCH + touch(buf, m); +#endif + } + free(buf); +} diff --git a/performance/lmbench3/src/bw_udp.c b/performance/lmbench3/src/bw_udp.c new file mode 100644 index 0000000..8479114 --- /dev/null +++ b/performance/lmbench3/src/bw_udp.c @@ -0,0 +1,203 @@ +/* + * bw_udp.c - simple UDP bandwidth test + * + * Three programs in one - + * server usage: bw_tcp -s + * client usage: bw_tcp [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname [bytes] + * shutdown: bw_tcp -S hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +#define MAX_MSIZE (10 * 1024 * 1024) + +typedef struct _state { + int sock; + int seq; + long move; + long msize; + char *server; + int fd; + char *buf; +} state_t; + +void server_main(); +void client_main(int parallel, state_t *state); +void init(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +void loop_transfer(iter_t iterations, void *cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + state_t state; + char *usage = "-s\n OR [-m <message size>] [-W <warmup>] [-N <repetitions>] server [size]\n OR -S serverhost\n"; + int c; + uint64 usecs; + + state.msize = 0; + state.move = 10*1024*1024; + + /* Rest is client argument processing */ + while (( c = getopt(ac, av, "sS:m:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int seq, n; + int sock = udp_connect(optarg, + UDP_XACT, + SOCKOPT_NONE); + for (n = -1; n > -5; --n) { + seq = htonl(n); + (void) send(sock, &seq, sizeof(int), 0); + } + close(sock); + exit (0); + } + case 'm': + state.msize = atoi(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind < ac - 2 || optind >= ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + if (optind < ac) { + state.move = bytes(av[optind]); + } + if (state.msize == 0) { + state.msize = state.move; + } + /* make the number of bytes to move a multiple of the message size */ + if (state.move % state.msize) { + state.move += state.move - state.move % state.msize; + } + + state.buf = valloc(state.msize); + if (!state.buf) { + perror("valloc"); + exit(1); + } + touch(state.buf, state.msize); + + /* + * Make one run take at least 5 seconds. 
+ * This minimizes the effect of connect & reopening TCP windows. + */ + benchmp(init, loop_transfer, cleanup, LONGER, parallel, warmup, repetitions, &state ); + +out: (void)fprintf(stderr, "socket UDP bandwidth using %s: ", state.server); + mb(state.move * get_n() * parallel); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); + state->seq = 0; + state->buf = (char*)malloc(state->msize); +} + +void +loop_transfer(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char *server = state->server; + int sock = state->sock; + long control[2], nbytes; + + nbytes = state->move; + control[0] = state->move; + control[1] = state->msize; + + while (iterations-- > 0) { + if (send(sock, control, 2 * sizeof(long), 0) != 2 * sizeof(long)) { + perror("bw_udp client: send failed"); + exit(5); + } + while (nbytes > 0) { + if (recv(sock, state->buf, state->msize, 0) != state->msize) { + perror("bw_udp client: recv failed"); + exit(5); + } + nbytes -= state->msize; + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +server_main() +{ + char *buf = (char*)valloc(MAX_MSIZE); + int sock, namelen, seq = 0; + long nbytes, msize; + struct sockaddr_in it; + + GO_AWAY; + + sock = udp_server(UDP_XACT, SOCKOPT_NONE); + + while (1) { + namelen = sizeof(it); + if (recvfrom(sock, (void*)buf, 2 * sizeof(long), 0, + (struct sockaddr*)&it, &namelen) < 0) { + fprintf(stderr, "bw_udp server: recvfrom: got wrong size\n"); + exit(9); + } + nbytes = ntohl(*(long*)buf); + msize = ntohl(*((long*)buf + 1)); + while (nbytes > 0) { + if (sendto(sock, (void*)buf, msize, 0, + (struct sockaddr*)&it, sizeof(it)) < 0) { + perror("bw_udp sendto"); + exit(9); + } + nbytes -= msize; + } + } +} + diff --git 
a/performance/lmbench3/src/bw_unix.c b/performance/lmbench3/src/bw_unix.c new file mode 100644 index 0000000..aad2078 --- /dev/null +++ b/performance/lmbench3/src/bw_unix.c @@ -0,0 +1,190 @@ +/* + * bw_unix.c - simple Unix stream socket bandwidth test + * + * Usage: bw_unix [-m <message size>] [-M <total bytes>] \ + * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +void reader(iter_t iterations, void * cookie); +void writer(int controlfd, int writefd, char* buf, void* cookie); + +size_t XFER = 10*1024*1024; + +struct _state { + int pid; + size_t xfer; /* bytes to read/write per "packet" */ + size_t bytes; /* bytes to read/write in one iteration */ + char *buf; /* buffer memory space */ + int pipes[2]; + int control[2]; + int initerr; +}; + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->buf = valloc(XFERSIZE); + touch(state->buf, XFERSIZE); + state->initerr = 0; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, state->pipes) == -1) { + perror("socketpair"); + state->initerr = 1; + return; + } + if (pipe(state->control) == -1) { + perror("pipe"); + state->initerr = 2; + return; + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + close(state->control[1]); + close(state->pipes[0]); + writer(state->control[0], state->pipes[1], state->buf, state); + return; + /*NOTREACHED*/ + + case -1: + perror("fork"); + state->initerr = 3; + return; + /*NOTREACHED*/ + + default: + break; + } + 
close(state->control[0]); + close(state->pipes[1]); +} +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + close(state->control[1]); + close(state->pipes[0]); + if (state->pid > 0) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +void +reader(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + size_t done, n; + size_t todo = state->bytes; + + while (iterations-- > 0) { + write(state->control[1], &todo, sizeof(todo)); + for (done = 0; done < todo; done += n) { + if ((n = read(state->pipes[0], state->buf, state->xfer)) <= 0) { + /* error! */ + exit(1); + } + } + } +} + +void +writer(int controlfd, int writefd, char* buf, void* cookie) +{ + size_t todo, n, done; + struct _state* state = (struct _state*)cookie; + + for ( ;; ) { + read(controlfd, &todo, sizeof(todo)); + for (done = 0; done < todo; done += n) { +#ifdef TOUCH + touch(buf, XFERSIZE); +#endif + if ((n = write(writefd, buf, state->xfer)) < 0) { + /* error! 
*/ + exit(1); + } + } + } +} + +int +main(int argc, char *argv[]) +{ + struct _state state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.xfer = XFERSIZE; /* per-packet size */ + state.bytes = XFER; /* total bytes per call */ + + while (( c = getopt(argc,argv,"m:M:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.xfer = bytes(optarg); + break; + case 'M': + state.bytes = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(argc, argv, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(argc, argv); + break; + } + } + if (optind == argc - 1) { + state.bytes = bytes(argv[optind]); + } else if (optind < argc - 1) { + lmbench_usage(argc, argv); + } + + state.pid = 0; + + /* round up total byte count to a multiple of xfer */ + if (state.bytes % state.xfer) { + state.bytes += state.bytes - state.bytes % state.xfer; + } + + benchmp(initialize, reader, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + fprintf(stderr, "AF_UNIX sock stream bandwidth: "); + mb(get_n() * parallel * XFER); + } + return(0); +} + + + diff --git a/performance/lmbench3/src/cache.c b/performance/lmbench3/src/cache.c new file mode 100644 index 0000000..7bc1651 --- /dev/null +++ b/performance/lmbench3/src/cache.c @@ -0,0 +1,750 @@ +/* + * cache.c - guess the cache size(s) + * + * usage: cache [-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + + +struct cache_results { + int len; + int maxlen; + int line; + int mline; + double latency; + double variation; + double ratio; + double slope; +}; + +int find_cache(int start, int n, struct cache_results* p); +int collect_data(int start, int line, int maxlen, + int repetitions, struct cache_results** pdata); +void search(int left, int right, int repetitions, + struct mem_state* state, struct cache_results* p); +int collect_sample(int repetitions, struct mem_state* state, + struct cache_results* p); +double measure(int size, int repetitions, + double* variation, struct mem_state* state); +double remove_chunk(int i, int chunk, int npages, size_t* pages, + int len, int repetitions, struct mem_state* state); +int test_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state); +int fixup_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state); +void check_memory(int size, struct mem_state* state); +void pagesort(int n, size_t* pages, double* latencies); + +#ifdef ABS +#undef ABS +#endif +#define ABS(a) ((a) < 0 ? -(a) : (a)) + +#define SWAP(a,b) {int _tmp = (a); (a) = (b); (b) = _tmp;} + +#define THRESHOLD 1.5 + +#define FIVE(m) m m m m m +#define TEN(m) FIVE(m) FIVE(m) +#define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) +#define HUNDRED(m) FIFTY(m) FIFTY(m) +#define DEREF p = (char**)*p; + +static char **addr_save = NULL; + +void +mem_benchmark(iter_t iterations, void *cookie) +{ + register char **p; + struct mem_state* state = (struct mem_state*)cookie; + + p = addr_save ? 
addr_save : (char**)state->p[0]; + while (iterations-- > 0) { + HUNDRED(DEREF); + } + addr_save = p; +} + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int c; + int i, j, n, start, level, prev, min; + int line = -1; + int warmup = 0; + int repetitions = TRIES; + int print_cost = 0; + int maxlen = 32 * 1024 * 1024; + int *levels; + double par, maxpar; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + struct cache_results* r; + struct mem_state state; + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + line = atoi(optarg); + if (line < sizeof(char*)) + line = sizeof(char*); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + state.width = 1; + state.len = maxlen; + state.maxlen = maxlen; + state.pagesize = getpagesize(); + + if (line <= 0) { + line = line_find(maxlen, warmup, repetitions, &state); + if (line <= 0) + line = getpagesize() / 16; + state.line = line; + } + + n = collect_data(512, line, maxlen, repetitions, &r); + r[n-1].line = line; + levels = (int*)malloc(n * sizeof(int)); + bzero(levels, n * sizeof(int)); + + for (start = 0, prev = 0, level = 0; + (i = find_cache(start, n, r)) >= 0; + ++level, start = i + 1, prev = i) + { + /* + * performance is not greatly improved over main memory, + * so it is likely not a cache boundary + */ + if (r[i].latency / r[n-1].latency > 0.5) break; + + /* + * is cache boundary "legal"? (e.g. 
2^N or 1.5*2^N) + * cache sizes are "never" 1.25*2^N or 1.75*2^N + */ + for (c = r[i].len; c > 0x7; c >>= 1) + ; + if (c == 5 || c == 7) { + i++; + if (i >= n) break; + } + + levels[level] = i; + } + + for (i = 0; i < level; ++i) { + prev = (i > 0 ? levels[i-1]: -1); + + /* locate most likely cache latency */ + for (j = min = prev + 1; j < levels[i]; ++j) { + if (r[j].latency <= 0.) continue; + if (r[min].latency <= 0. + || ABS(r[j].slope) < ABS(r[min].slope)) { + min = j; + } + } + + /* Compute line size */ + if (i == level - 1) { + line = r[n-1].line; + } else { + j = (levels[i] + levels[i+1]) / 2; + for (line = -1; line <= 0 && j < n; ++j) { + r[j].line = line_find(r[j].len, warmup, + repetitions, &state); + line = r[j].line; + } + } + + /* Compute memory parallelism for cache */ + maxpar = par_mem(r[levels[i]-1].len, warmup, + repetitions, &state); + + fprintf(stderr, + "L%d cache: %d bytes %.2f nanoseconds %d linesize %.2f parallelism\n", + i+1, r[levels[i]].len, r[min].latency, line, maxpar); + } + + /* Compute memory parallelism for main memory */ + j = n - 1; + for (i = n - 1; i >= 0; i--) { + if (r[i].latency < 0.) continue; + if (r[i].latency > 0.99 * r[n-1].latency) + j = i; + } + par = par_mem(r[j].len, warmup, repetitions, &state); + + fprintf(stderr, "Memory latency: %.2f nanoseconds %.2f parallelism\n", + r[n-1].latency, par); + + exit(0); +} + +int +find_cache(int start, int n, struct cache_results* p) +{ + int i, j, prev; + double max = -1.; + + for (prev = (start == 0 ? start : start - 1); prev > 0; prev--) { + if (p[prev].ratio > 0.0) break; + } + + for (i = start, j = -1; i < n; ++i) { + if (p[i].latency < 0.) 
			continue;
		if (p[prev].ratio <= p[i].ratio && p[i].ratio > max) {
			j = i;
			max = p[i].ratio;
		} else if (p[i].ratio < max && THRESHOLD < max) {
			/* ratio fell off a significant peak: that peak was the boundary */
			return j;
		}
		prev = i;
	}
	return -1;
}

/*
 * collect_data - build the array of latency samples.
 *
 * Sample sizes run from `start` to `maxlen` in a quasi-geometric
 * progression (four steps per doubling).  Only the first and last
 * samples are measured up front; search() fills in the interior
 * adaptively.  Returns the number of samples; *pdata points at the
 * malloc'd array (caller keeps it for the life of the program).
 */
int
collect_data(int start, int line, int maxlen,
	int repetitions, struct cache_results** pdata)
{
	int	i;
	int	samples;
	int	idx;
	int	len = start;
	int	incr = start / 4;
	double	latency;	/* NOTE(review): unused */
	double	variation;	/* NOTE(review): unused */
	struct mem_state state;
	struct cache_results* p;


	state.width = 1;
	state.len = maxlen;
	state.maxlen = maxlen;
	state.line = line;
	state.pagesize = getpagesize();
	state.addr = NULL;

	/* count the (maximum) number of samples to take */
	for (len = start, incr = start / 4, samples = 0; len <= maxlen; incr<<=1) {
		for (i = 0; i < 4 && len <= maxlen; ++i, len += incr)
			samples++;
	}
	*pdata = (struct cache_results*)
		malloc(samples * sizeof(struct cache_results));

	p = *pdata;

	/* initialize the data */
	for (len = start, incr = start / 4, idx = 0; len <= maxlen; incr<<=1) {
		for (i = 0; i < 4 && len <= maxlen; ++i, ++idx, len += incr) {
			p[idx].len = len;
			p[idx].line = -1;
			p[idx].mline = -1;
			p[idx].latency = -1.;
			p[idx].ratio = -1.;
			p[idx].slope = -1.;
		}
	}

	/* make sure we have enough memory for the scratch data */
	while (state.addr == NULL) {
		mem_initialize(0, &state);
		if (state.addr == NULL) {
			/* halve the target until the allocation succeeds */
			maxlen /= 2;
			state.len = state.maxlen = maxlen;
			while (p[samples-1].len > maxlen)
				samples--;
		}
	}
	for (i = 0; i < samples; ++i)
		p[i].maxlen = maxlen;
	/* in case the system has laid out the pages well, don't scramble */
	for (i = 0; i < state.npages; ++i)
		state.pages[i] = i * state.pagesize;

	p[0].latency = measure(p[0].len, repetitions, &p[0].variation, &state);
	p[samples-1].latency = measure(p[samples-1].len, repetitions,
				       &p[samples-1].variation, &state);
	/*
	 * NOTE(review): this loop re-measures the current last sample and
	 * then unconditionally decrements `samples`; the interplay between
	 * the re-measured index and the shrinking count looks suspicious
	 * (a successfully re-measured sample is still dropped) -- confirm
	 * the intended trailing-sample trimming behavior.
	 */
	while (p[samples-1].latency <= 0.0) {
		p[samples-1].latency = measure(p[samples-1].len,
					       repetitions,
					       &p[samples-1].variation,
					       &state);
		--samples;
	}
	/* adaptively fill in the interior samples between the endpoints */
	search(0, samples - 1, repetitions, &state, p);

	/*
	fprintf(stderr, "%10.10s %8.8s %8.8s %8.8s %8.8s %5.5s %5.5s\n",
		"mem size", "latency", "variation", "ratio", "slope",
		"line", "mline");
	for (idx = 0; idx < samples; ++idx) {
		if (p[idx].latency < 0.) continue;
		fprintf(stderr,
			"%10.6f %8.3f %8.3f %8.3f %8.3f %4d %4d\n",
			p[idx].len / (1000. * 1000.),
			p[idx].latency,
			p[idx].variation,
			p[idx].ratio,
			p[idx].slope,
			p[idx].line,
			p[idx].mline);
	}
	/**/
	mem_cleanup(0, &state);

	return samples;
}

/*
 * search - recursive bisection over [left, right].  Computes the
 * left/right latency ratio; when it indicates a significant change
 * (ratio > 1.35 or < 0.97), measures the midpoint and recurses into
 * both halves.  Obvious bad data points (ratio < 0.98) are overwritten
 * with the right endpoint's latency.
 */
void
search(int left, int right, int repetitions,
	struct mem_state* state, struct cache_results* p)
{
	int	middle = left + (right - left) / 2;

	if (p[left].latency > 0.0) {
		p[left].ratio = p[right].latency / p[left].latency;
		p[left].slope = (p[left].ratio - 1.) / (double)(right - left);
		/* we probably have a bad data point, so ignore it */
		if (p[left].ratio < 0.98) {
			p[left].latency = p[right].latency;
			p[left].ratio = 1.;
			p[left].slope = 0.;
		}
	}

	if (middle == left || middle == right)
		return;

	if (p[left].ratio > 1.35 || p[left].ratio < 0.97) {
		collect_sample(repetitions, state, &p[middle]);
		search(middle, right, repetitions, state, p);
		search(left, middle, repetitions, state, p);
	}
	return;
}

/*
 * collect_sample - measure one working-set size; for multi-page sets,
 * iterate test_chunk() (up to 8 times) to swap out pages that collide
 * in the cache so the measurement reflects the best-case layout.
 * Returns non-zero if a positive latency was obtained.
 */
int
collect_sample(int repetitions, struct mem_state* state,
		struct cache_results* p)
{
	int	i, modified, npages;
	double	baseline;

	npages = (p->len + getpagesize() - 1) / getpagesize();
	baseline = measure(p->len, repetitions, &p->variation, state);

	if (npages > 1) {
		for (i = 0, modified = 1; i < 8 && modified; ++i) {
			modified = test_chunk(0, npages, npages,
					      state->pages, p->len,
					      &baseline, 0.0,
					      repetitions, state);
		}
	}
	p->latency = baseline;

	return (p->latency > 0);
}

/*
 * measure - time the average load latency (in ns) of a pointer chain
 * covering `size` bytes, taking the median over `repetitions` runs.
 * *variation is set to median/minimum as a stability indicator.
 */
double
measure(int size, int repetitions,
	double* variation, struct mem_state* state)
{
	int	i, j, npages, nlines;
	double	time, median;
	char	*p;
	result_t *r, *r_save;
	size_t	*pages;

	pages = state->pages;
	npages = (size + getpagesize() - 1) / getpagesize();
	nlines = state->nlines;

	/*
	 * NOTE(review): if size is not page-aligned and the remainder is
	 * smaller than one line, nlines becomes 0 and lines[nlines - 1]
	 * below indexes lines[-1] -- confirm callers never pass such sizes.
	 */
	if (size % getpagesize())
		nlines = (size % getpagesize()) / state->line;

	r_save = get_results();
	r = (result_t*)malloc(sizeof_result(repetitions));
	insertinit(r);

	/*
	 * assumes that you have used mem_initialize() to setup the memory
	 */
	/*
	 * Link the pages into one chain: the last line of each page points
	 * at the first line of the next page, word by word; the final page
	 * wraps back to page 0 with the word index rotated by one.
	 */
	p = state->base;
	for (i = 0; i < npages - 1; ++i) {
		for (j = 0; j < state->nwords; ++j) {
			*(char**)(p + pages[i] + state->lines[state->nlines - 1] + state->words[j]) =
				p + pages[i+1] + state->lines[0] + state->words[j];
		}
	}
	for (j = 0; j < state->nwords; ++j) {
		*(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) =
			p + pages[0] + state->lines[0] + state->words[(j+1)%state->nwords];
	}

	/*
	check_memory(size, state);
	/**/

	addr_save = NULL;
	state->p[0] = p + pages[0] + state->lines[0] + state->words[0];
	/* now, run through the chain once to clear the cache */
	mem_benchmark((size / sizeof(char*) + 100) / 100, state);

	for (i = 0; i < repetitions; ++i) {
		BENCH1(mem_benchmark(__n, state); __n = 1;, 0)
		insertsort(gettime(), get_n(), r);
	}
	set_results(r);
	/* 100 dereferences per benchmark iteration; scale to ns per load */
	median = (1000. * (double)gettime()) / (100. * (double)get_n());

	save_minimum();
	time = (1000. * (double)gettime()) / (100. * (double)get_n());

	/* Are the results stable, or do they vary? */
	if (time != 0.)
		*variation = median / time;
	else
		*variation = -1.0;
	set_results(r_save);
	free(r);

	/* undo the short-page wrap so the full chain is intact again */
	if (nlines < state->nlines) {
		for (j = 0; j < state->nwords; ++j) {
			*(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) =
				p + pages[npages - 1] + state->lines[nlines] + state->words[j];
		}
	}
	/*
	fprintf(stderr, "%.6f %.2f\n", state->len / (1000. * 1000.), median);
	/**/

	return median;
}


/*
 * remove_chunk - measure latency with `chunk` pages (starting at index
 * i) temporarily swapped to the end of the page list and excluded from
 * the measured length.  The page list is restored before returning.
 */
double
remove_chunk(int i, int chunk, int npages, size_t* pages,
	int len, int repetitions, struct mem_state* state)
{
	int	n, j;
	double	t, var;

	if (i + chunk < npages) {
		for (j = 0; j < chunk; ++j) {
			n = pages[i+j];
			pages[i+j] = pages[npages-1-j];
			pages[npages-1-j] = n;
		}
	}
	t = measure(len - chunk * getpagesize(), repetitions, &var, state);
	if (i + chunk < npages) {
		for (j = 0; j < chunk; ++j) {
			n = pages[i+j];
			pages[i+j] = pages[npages-1-j];
			pages[npages-1-j] = n;
		}
	}

	return t;
}

/*
 * test_chunk - recursively locate page subsets whose removal lowers
 * latency (i.e. pages that collide in the cache) and hand them to
 * fixup_chunk() for replacement.  Returns non-zero if the page list
 * was modified.
 */
int
test_chunk(int i, int chunk, int npages, size_t* pages, int len,
	   double *baseline, double chunk_baseline,
	   int repetitions, struct mem_state* state)
{
	int	j, k, subchunk;
	int	modified = 0;
	int	changed;
	double	t, tt, nodiff_chunk_baseline;

	/* small enough to fix page-by-page */
	if (chunk <= 20 && chunk < npages) {
		return fixup_chunk(i, chunk, npages, pages, len, baseline,
				   chunk_baseline, repetitions, state);
	}

	nodiff_chunk_baseline = *baseline;
	subchunk = (chunk + 19) / 20;
	for (j = i, k = 0; j < i + chunk; j+=subchunk, k++) {
		if (j + subchunk > i + chunk) subchunk = i + chunk - j;

		t = remove_chunk(j, subchunk, npages, pages,
				 len, repetitions, state);

		/*
		fprintf(stderr, "test_chunk(...): baseline=%G, t=%G, len=%d, chunk=%d, i=%d\n", *baseline, t, len, subchunk, j);
		/**/

		/* removal didn't help noticeably: this subchunk is fine */
		if (t >= 0.99 * *baseline) continue;
		if (t >= 0.999 * nodiff_chunk_baseline) continue;

		/* re-measure to guard against a noisy first reading */
		tt = remove_chunk(j, subchunk, npages, pages,
				  len, repetitions, state);

		if (tt > t) t = tt;

		if (t >= 0.99 * *baseline) continue;
		if (t >= 0.999 * nodiff_chunk_baseline) continue;

		changed = test_chunk(j, subchunk, npages, pages, len,
				     baseline, t, repetitions, state);

		if (changed) {
			modified = 1;
		} else {
			nodiff_chunk_baseline = t;
		}
	}
	return modified;
}

/*
 * This routine is called once we have identified a chunk
 * that has pages that are suspected of colliding with other
 *
pages. + * + * The algorithm is to remove all the pages, and then + * slowly add back pages; attempting to add pages with + * minimal cost. + */ +int +fixup_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state) +{ + int j, k, l, m; + int page, substitute, original; + int ntotalpages, nsparepages; + int subset_len; + int swapped = 0; + size_t *pageset; + size_t *saved_pages; + static int available_index = 0; + double t, tt, low, var, new_baseline; + double latencies[20]; + + ntotalpages = state->maxlen / getpagesize(); + nsparepages = ntotalpages - npages; + pageset = state->pages + npages; + new_baseline = *baseline; + + saved_pages = (size_t*)malloc(sizeof(size_t) * ntotalpages); + bcopy(pages, saved_pages, sizeof(int) * ntotalpages); + + /* move everything to the end of the page list */ + if (i + chunk < npages) { + for (j = 0; j < chunk; ++j) { + page = pages[i+j]; + pages[i+j] = pages[npages-chunk+j]; + pages[npages-chunk+j] = page; + } + } + + if (available_index >= nsparepages) available_index = 0; + + /* + * first try to identify which pages we can definitely keep + */ + for (j = 0, k = chunk; j < k; ) { + + t = measure((npages - chunk + j + 1) * getpagesize(), + repetitions, &var, state); + + if (0.995 * t <= chunk_baseline) { + latencies[j] = t; + ++j; /* keep this page */ + } else { + --k; /* this page is probably no good */ + latencies[k] = t; + SWAP(pages[npages - chunk + j], pages[npages - chunk + k]); + } + } + /* + * sort the "bad" pages by increasing latency + */ + pagesort(chunk - j, &pages[npages - chunk + j], &latencies[j]); + + /* + fprintf(stderr, "fixup_chunk: len=%d, chunk=%d, j=%d, baseline=%G, lat[%d]=%G..%G\n", len, chunk, j, *baseline, j, (j < chunk ? 
latencies[j] : -1.0), latencies[chunk - 1]); + /**/ + + if (chunk >= npages && j < chunk / 2) { + j = chunk / 2; + t = measure((npages - chunk + j + 1) * getpagesize(), + repetitions, &var, state); + chunk_baseline = t; + } + + for (k = 0; j < chunk && k < 2 * npages; ++k) { + original = npages - chunk + j; + substitute = nsparepages - 1; + substitute -= (k + available_index) % (nsparepages - 1); + subset_len = (original + 1) * getpagesize(); + if (j == chunk - 1 && len % getpagesize()) { + subset_len = len; + } + + SWAP(pages[original], pageset[substitute]); + t = measure(subset_len, repetitions, &var, state); + SWAP(pages[original], pageset[substitute]); + + /* + * try to keep pages ordered by increasing latency + */ + if (t < latencies[chunk - 1]) { + latencies[chunk - 1] = t; + SWAP(pages[npages - 1], pageset[substitute]); + pagesort(chunk - j, + &pages[npages - chunk + j], &latencies[j]); + } + if (0.995 * latencies[j] <= chunk_baseline) { + ++j; /* keep this page */ + ++swapped; + } + } + + available_index = (k + available_index) % (nsparepages - 1); + + /* measure new baseline, in case we didn't manage to optimally + * replace every page + */ + if (swapped) { + new_baseline = measure(len, repetitions, &var, state); + + /* + fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d, baseline=%G, newbase=%G\n", len, swapped, k, *baseline, new_baseline); + /**/ + + if (new_baseline >= 0.999 * *baseline) { + /* no benefit to these changes; back them out */ + swapped = 0; + bcopy(saved_pages, pages, sizeof(int) * ntotalpages); + } else { + /* we sped up, so keep these changes */ + *baseline = new_baseline; + + /* move back to the middle of the pagelist */ + if (i + chunk < npages) { + for (j = 0; j < chunk; ++j) { + page = pages[i+j]; + pages[i+j] = pages[npages-chunk+j]; + pages[npages-chunk+j] = page; + } + } + } + /* + } else { + fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d\n", len, swapped, k); + /**/ + } + free(saved_pages); + + return swapped; +} + 
+void +check_memory(int size, struct mem_state* state) +{ + int i, j, first_page, npages, nwords; + int page, word_count, pagesize; + off_t offset; + char **p, **q; + char **start; + + pagesize = getpagesize(); + npages = (size + pagesize - 1) / pagesize; + nwords = size / sizeof(char*); + + /* + fprintf(stderr, "check_memory(%d, ...): entering, %d words\n", size, nwords); + /**/ + word_count = 1; + first_page = 0; + start = (char**)(state->base + state->pages[0] + state->lines[0] + state->words[0]); + for (q = p = (char**)*start; p != start; ) { + word_count++; + offset = (unsigned long)p - (unsigned long)state->base; + page = offset - offset % pagesize; + for (j = first_page; j < npages; ++j) { + if (page == state->pages[j]) break; + } + if (j == npages) { + for (j = 0; j < first_page; ++j) { + if (page == state->pages[j]) break; + } + if (j == first_page) { + fprintf(stderr, + "check_memory: bad memory reference for size %d\n", + size); + } + } + first_page = j % npages; + p = (char**)*p; + if (word_count & 0x1) q = (char**)*q; + if (*p == *q) { + fprintf(stderr, "check_memory: unwanted memory cycle! 
page=%d\n", j); + return; + } + } + if (word_count != nwords) { + fprintf(stderr, "check_memory: wrong word count, expected %d, got %d\n", nwords, word_count); + } + /* + fprintf(stderr, "check_memory(%d, ...): exiting\n", size); + /**/ +} + +void +pagesort(int n, size_t* pages, double* latencies) +{ + int i, j; + double t; + + for (i = 0; i < n - 1; ++i) { + for (j = i + 1; j < n; ++j) { + if (latencies[i] > latencies[j]) { + t = latencies[i]; + latencies[i] = latencies[j]; + latencies[j] = t; + SWAP(pages[i], pages[j]); + } + } + } +} diff --git a/performance/lmbench3/src/clock.c b/performance/lmbench3/src/clock.c new file mode 100644 index 0000000..48ff8a0 --- /dev/null +++ b/performance/lmbench3/src/clock.c @@ -0,0 +1,24 @@ +/* + * clock.c + * + * calculate the minimum timing loop length that gives us significant results + */ +#include "bench.h" + +char *id = "$Id$"; +char *revision = "$Revision$"; + +main() +{ + uint64 enough; + double t_overhead, l_overhead; + + enough = compute_enough(15); + printf("ENOUGH=%lu\n", (unsigned long)enough); fflush(stdout); + t_overhead = timing_overhead(enough); + printf("TIMING_OVERHEAD=%f\n", t_overhead); fflush(stdout); + l_overhead = loop_overhead(enough, t_overhead); + printf("LOOP_OVERHEAD=%f\n", l_overhead); + printf("# version [%s]\n", revision); + exit(0); +} diff --git a/performance/lmbench3/src/disk.c b/performance/lmbench3/src/disk.c new file mode 100644 index 0000000..c3f1154 --- /dev/null +++ b/performance/lmbench3/src/disk.c @@ -0,0 +1,310 @@ +/* + * disk - calculate zone bandwidths and seek times + * + * Usage: disk device + * + * Copyright (c) 1994-1997 Larry McVoy. All rights reserved. + * Bits of this are derived from work by Ethan Solomita. 
+ */ + +#include <stdio.h> +#include <sys/types.h> +#include <unistd.h> +#include <stdlib.h> +#include "bench.h" +#include "flushdisk.c" + +#ifndef sgi +#define NO_LSEEK64 +#define off64_t long long +#endif +#define SEEKPOINTS 2000 +#define ZONEPOINTS 150 + +uint64 disksize(char *); +int seekto(int, uint64); +int zone(char *disk, int oflag, int bsize); +int seek(char *disk, int oflag); + +int +main(int ac, char **av) +{ + fprintf(stderr, "\"Seek times for %s\n", av[1]); + seek(av[1], 0); + fprintf(stderr, "\n"); + fprintf(stderr, "\"Zone bandwidth for %s\n", av[1]); + zone(av[1], 0, (1<<20)); + return (0); +} + +int +zone(char *disk, int oflag, int bsize) +{ + char *buf; + int usecs; + int error; + int n; + int fd; + uint64 off; + int stride; + + if ((fd = open(disk, oflag)) == -1) { + perror(disk); + exit(1); + } + buf = valloc(bsize); + if (!buf) { + perror("valloc"); + exit(1); + } + bzero(buf, bsize); +#ifdef linux + flushdisk(fd); +#endif + + /* + * We want ZONEPOINTS data points + * but the stride has to be at least 512 and a 512 multiple. + * Weird code below for precision. + */ + off = disksize(disk); + off /= ZONEPOINTS; + stride = off; + if (stride < 512) stride = 512; + stride += 511; + stride >>= 9; + stride <<= 9; + + /* + * Very small disks such as ZIP drives get a 256K blocksize. + * As measured on my SCSI ZIP, there seems to be no + * difference between 256K and 1MB for sequential reads. + * XXX - there is a rotational delay difference but that's tough. + */ + if (bsize > stride) bsize = 256<<10; + if (bsize > stride) stride = bsize; + + off *= ZONEPOINTS; + debug((stdout, "stride=%d bs=%d size=%dM points=%d\n", + stride, bsize, (int)(off >> 20), (int)(off/stride))); + + /* + * Read buf's worth of data every stride and time it. + * Don't include the rotational delay. + * This first I/O outside the loop is to catch read/write permissions. + */ + +#define IO(a,b,c) (oflag == 0 ? 
(n = read(a,b,c)) : (n = write(a,b,c))) + + error = IO(fd, buf, 512); + if (error == -1) { + perror(disk); + exit(1); + } + off = 512; + for ( ;; ) { + if (IO(fd, buf, 1024) != 1024) { + exit(0); + } + off += 1024; + start(0); + if (IO(fd, buf, bsize) != bsize) { + exit(0); + } + usecs = stop(0, 0); + off += bsize; + fprintf(stderr, "%.01f %.2f\n", + off/1000000.0, (double)bsize/usecs); + off += stride; + if (seekto(fd, off)) { + exit(0); + } + } + exit(0); +} + +/* + * Seek - calculate seeks as a function of distance. + */ +#undef IO +#define IO(a,b,c) error = (oflag == 0 ? read(a,b,c) : write(a,b,c)); \ + if (error == -1) { perror("io"); exit(1); } +#define IOSIZE 512 +#define TOOSMALL 1000 /* seeks this small are cached */ +#define TOOBIG 1000000 /* seeks this big are remapped or weirdos */ + /* zip drives have seeks this long */ + +int +seek(char *disk, int oflag) +{ + char *buf; + int fd; + off64_t size; + off64_t begin, end; + int usecs; + int error; + int tot_msec = 0, tot_io = 0; + int stride; + + if ((fd = open(disk, oflag)) == -1) { + perror(disk); + return (-1); + } +#ifdef linux + flushdisk(fd); +#endif + size = disksize(disk); + buf = valloc(IOSIZE); + bzero(buf, IOSIZE); + + /* + * We flip back and forth, in strides of 1MB (typically). + * If we have a 100MB fd, that means we do + * 1, 99, 2, 98, etc. + * + * We want around SEEK POINTS data points + * but the stride has to be at least 512 and a 512 multiple. 
+ */ + stride = size / SEEKPOINTS; + if (stride < 512) stride = 512; + stride += 511; + stride >>= 9; + stride <<= 9; + + debug((stdout, "stride=%d size=%dM points=%d\n", + stride, (int)(size >> 20), (int)(size/stride))); + + end = size; + begin = 0; + seekto(fd, begin); + IO(fd, buf, IOSIZE); + while (end >= begin + stride*2) { + end -= stride; + start(0); + seekto(fd, end); + IO(fd, buf, IOSIZE); + usecs = stop(0, 0); + if (usecs > TOOSMALL && usecs < TOOBIG) { + tot_io++; tot_msec += usecs/1000; + fprintf(stderr, "%.01f %.02f\n", + (end - begin - stride) / 1000000., usecs/1000.); + } + + begin += stride; + start(0); + seekto(fd, begin); + IO(fd, buf, IOSIZE); + usecs = stop(0, 0); + if (usecs > TOOSMALL && usecs < TOOBIG) { + tot_io++; tot_msec += usecs/1000; + fprintf(stderr, "%.01f %.02f\n", + (end + stride - begin) / 1000000., usecs/1000.); + } + } + /* + * This is wrong, it should take the 1/3 stroke seek average. + avg_msec = (double)tot_msec/tot_io; + fprintf(stderr, "Average time == %.04f\n", avg_msec); + */ + return (0); +} + +/* + * Calculate how big a device is. + * + * To avoid 32 bit problems, our units are MB. + */ +#define FORWARD (512<<20) +#define FORWARD1 (64<<20) +#define FORWARD2 (1<<20) + +/* + * Go forward in 1GB chunks until you can't. + * Go backwards in 128MB chunks until you can. + * Go forwards in 1MB chunks until you can't and return that -1. + */ +uint64 +disksize(char *disk) +{ + int fd = open(disk, 0); + char buf[512]; + uint64 off = 0; + + if (fd == -1) { + perror("usage: disksize device"); + return(0); + } + /* + * Go forward until it doesn't work. 
+ */ + for ( ;; ) { + off += FORWARD; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD; + break; + } + } + + for ( ;; ) { + off += FORWARD1; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD1; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD1; + break; + } + } + + for ( ;; ) { + off += FORWARD2; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD2; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD2; + break; + } + } + + debug((stdout, "disksize(%s) = %d MB\n", disk, (int)(off >> 20))); + return (off); +} + +#define BIGSEEK (1<<30) + +int +seekto(int fd, uint64 off) +{ +#ifdef __linux__ + extern loff_t llseek(int, loff_t, int); + + if (llseek(fd, (loff_t)off, SEEK_SET) == (loff_t)-1) { + return(-1); + } + return (0); +#else + uint64 here = 0; + + lseek(fd, 0, 0); + while ((uint64)(off - here) > (uint64)BIGSEEK) { + if (lseek(fd, BIGSEEK, SEEK_CUR) == -1) break; + here += BIGSEEK; + } + assert((uint64)(off - here) <= (uint64)BIGSEEK); + if (lseek(fd, (int)(off - here), SEEK_CUR) == -1) return (-1); + return (0); +#endif +} diff --git a/performance/lmbench3/src/enough.c b/performance/lmbench3/src/enough.c new file mode 100644 index 0000000..6128ccf --- /dev/null +++ b/performance/lmbench3/src/enough.c @@ -0,0 +1,13 @@ +#include <stdio.h> +#include <stdlib.h> + +extern int get_enough(int); + +int +main() +{ + putenv("LOOP_O=0.0"); + putenv("TIMING_O=0.0"); + printf("%u\n", get_enough(0)); + return (0); +} diff --git a/performance/lmbench3/src/flushdisk.c b/performance/lmbench3/src/flushdisk.c new file 
mode 100644 index 0000000..0c422ed --- /dev/null +++ b/performance/lmbench3/src/flushdisk.c @@ -0,0 +1,42 @@ +#ifdef linux +/* + * flushdisk() - linux block cache clearing + */ + +#include <stdio.h> +#include <sys/types.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/mount.h> + +int +flushdisk(int fd) +{ + int ret = ioctl(fd, BLKFLSBUF, 0); + usleep(100000); + return (ret); +} + +#endif + +#ifdef MAIN +int +main(int ac, char **av) +{ +#ifdef linux + int fd; + int i; + + for (i = 1; i < ac; ++i) { + fd = open(av[i], 0); + if (flushdisk(fd)) { + exit(1); + } + close(fd); + } +#endif + exit(0); +} +#endif diff --git a/performance/lmbench3/src/getopt.c b/performance/lmbench3/src/getopt.c new file mode 100644 index 0000000..a868959 --- /dev/null +++ b/performance/lmbench3/src/getopt.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 1997 L.W.McVoy + * + * SGI's fucking getopt doesn't follow GNU's reset policy. Isn't having + * N versions of Unix a great thing for the world? I'm gonna move to NT + * if these assholes don't get their act together. + * + * This version handles + * + * - (leaves it and returns) + * -a + * -abcd + * -r <arg> + * -r<arg> + * -abcr <arg> + * -abcr<arg> + * -r<arg> -R<arg>, etc. + * + * A special form is "d|" instead of "d:". This means the arg has to be + * right next to the option. + * Another special form is "d;". This means the option must be right next + * to the option letter and can not be blank. + */ +#include "bench.h" +static char *id = "%@%"; + +int optopt; /* option that is in error, if we return an error */ +int optind; /* next arg in argv we process */ +char *optarg; /* argument to an option */ +static int n; + +int +getopt(int ac, char **av, char *opts) +{ + char *t; + + if (!optind) { + optind = 1; + n = 1; + } + debug((stderr, "GETOPT ind=%d n=%d arg=%s av[%d]='%s'\n", + optind, n, optarg ? 
optarg : "", optind, av[optind])); + + if ((optind >= ac) || (av[optind][0] != '-') || !av[optind][1]) { + return (EOF); + } + + assert(av[optind][n]); + for (t = (char *)opts; *t; t++) { + if (*t == av[optind][n]) { + break; + } + } + if (!*t) { + optopt = av[optind][n]; + debug((stderr, "\tran out of option letters\n")); + return ('?'); + } + + /* OK, we found a legit option, let's see what to do with it. + * If it isn't one that takes an option, just advance and return. + */ + if (t[1] != ':' && t[1] != '|' && t[1] != ';') { + if (!av[optind][n+1]) { + optind++; + n = 1; + } else { + n++; + } + debug((stderr, "\tLegit singleton %c\n", *t)); + return (*t); + } + + /* got one with an option, see if it is cozied up to the flag */ + if (av[optind][n+1]) { + if (av[optind][n+1]) { + optarg = &av[optind][n+1]; + } else { + optarg = 0; + } + optind++; + n = 1; + debug((stderr, "\t%c with %s\n", *t, optarg)); + return (*t); + } + + /* If it was not there, and it is optional, OK */ + if (t[1] == '|') { + optarg = 0; + optind++; + n = 1; + debug((stderr, "\t%c without arg\n", *t)); + return (*t); + } + + /* was it supposed to be there? */ + if (t[1] == ';') { + optarg = 0; + optind++; + optopt = *t; + debug((stderr, "\twanted another word\n")); + return ('?'); + } + + /* Nope, there had better be another word. 
*/ + if ((optind + 1 == ac) || (av[optind+1][0] == '-')) { + optopt = av[optind][n]; + debug((stderr, "\twanted another word\n")); + return ('?'); + } + optarg = av[optind+1]; + optind += 2; + n = 1; + debug((stderr, "\t%c with arg %s\n", *t, optarg)); + return (*t); +} + +#ifdef TEST + +/* XXX a.out -y file */ +main(int ac, char **av) +{ + extern char *optarg; + extern int optind; + char *comment = 0; + int c; + + while ((c = getopt(ac, av, "fnpsx:y|")) != -1) { + switch (c) { + case 'f': + case 'n': + case 'p': + case 's': + printf("Got option %c\n", c); + break; + case 'x': + case 'y': + comment = optarg; + printf("Got optarg %s with -%c\n", comment, c); + break; + case '?': + fprintf(stderr, "bad option %c\n", optopt); + break; + default: + fprintf(stderr, "unknown ret %c\n", c); + break; + } + } + while (av[optind]) { + printf("av[%d] = %s\n", optind, av[optind++]); + } + exit(0); +} +#endif diff --git a/performance/lmbench3/src/hello.c b/performance/lmbench3/src/hello.c new file mode 100644 index 0000000..15a2493 --- /dev/null +++ b/performance/lmbench3/src/hello.c @@ -0,0 +1,8 @@ +#include "bench.h" + +int +main() +{ + write(1, "Hello world\n", 12); + return (0); +} diff --git a/performance/lmbench3/src/lat_cmd.c b/performance/lmbench3/src/lat_cmd.c new file mode 100644 index 0000000..412a4d2 --- /dev/null +++ b/performance/lmbench3/src/lat_cmd.c @@ -0,0 +1,100 @@ +/* + * lat_cmd.c - time to complete a given command line + * + * usage: lat_cmd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmd... + * + * Copyright (c) 2004 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void bench(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +typedef struct _state { + char** argv; + pid_t pid; +} state_t; + +int +main(int ac, char **av) +{ + int c; + int i; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char buf[1024]; + state_t state; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmdline...\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind >= ac) { + lmbench_usage(ac, av, usage); + } + state.argv = (char**)malloc((ac - optind + 1) * sizeof(char*)); + state.pid = 0; + for (i = 0; i < ac - optind; ++i) { + state.argv[i] = av[optind + i]; + } + state.argv[i] = NULL; + + benchmp(NULL, bench, NULL, 0, parallel, warmup, repetitions, &state); + micro("lat_cmd", get_n()); + return (0); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t* state = (state_t*)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +bench(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + signal(SIGCHLD, SIG_DFL); + while (iterations-- > 0) { + switch (state->pid = fork()) { + case 0: + execvp(state->argv[0], state->argv); + /*NOTREACHED*/ + default: + break; + } + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + diff --git a/performance/lmbench3/src/lat_connect.c b/performance/lmbench3/src/lat_connect.c new file mode 100644 index 0000000..6639cca --- /dev/null +++ b/performance/lmbench3/src/lat_connect.c @@ -0,0 +1,110 @@ +/* + * lat_connect.c - simple TCP connection latency test + * + * Three programs in 
one - + * server usage: lat_connect -s + * client usage: lat_connect [-N <repetitions>] hostname + * shutdown: lat_connect -hostname + * + * lat_connect may not be parallelized because of idiosyncracies + * with TCP connection creation. Basically, if the client tries + * to create too many connections too quickly, the system fills + * up the set of available connections with TIME_WAIT connections. + * We can only measure the TCP connection cost accurately if we + * do just a few connections. Since the parallel harness needs + * each child to run for a second, this guarantees that the + * parallel version will generate inaccurate results. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +typedef struct _state { + char *server; +} state_t; + +void doclient(iter_t iterations, void * cookie); +void server_main(); + +int +main(int ac, char **av) +{ + state_t state; + int repetitions = TRIES; + int c; + char buf[256]; + char *usage = "-s\n OR [-S] [-N <repetitions>] server\n"; + + while (( c = getopt(ac, av, "sSP:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int sock = tcp_connect(av[optind], + TCP_CONNECT, + SOCKOPT_NONE); + write(sock, "0", 1); + close(sock); + exit(0); + } + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + benchmp(NULL, doclient, NULL, 0, 1, 0, repetitions, &state); + + sprintf(buf, "TCP/IP connection cost to %s", state.server); + micro(buf, get_n()); + exit(0); +} + 
+void +doclient(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register char *server = state->server; + register int sock; + + while (iterations-- > 0) { + sock = tcp_connect(server, TCP_CONNECT, SOCKOPT_REUSE); + close(sock); + } +} + +void +server_main() +{ + int newsock, sock; + char c ='1'; + + GO_AWAY; + sock = tcp_server(TCP_CONNECT, SOCKOPT_NONE|SOCKOPT_REUSE); + for (;;) { + newsock = tcp_accept(sock, SOCKOPT_NONE); + if (read(newsock, &c, 1) > 0) { + tcp_done(TCP_CONNECT); + exit(0); + } + close(newsock); + } + /* NOTREACHED */ +} diff --git a/performance/lmbench3/src/lat_ctx.c b/performance/lmbench3/src/lat_ctx.c new file mode 100644 index 0000000..4c81af8 --- /dev/null +++ b/performance/lmbench3/src/lat_ctx.c @@ -0,0 +1,350 @@ +/* + * lat_ctx.c - context switch timer + * + * usage: lat_ctx [-P parallelism] [-W <warmup>] [-N <repetitions>] [-s size] #procs [#procs....] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + + +#define MAXPROC 2048 +#define CHUNK (4<<10) +#define TRIPS 5 +#ifndef max +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +void doit(int rd, int wr, int process_size); +int create_pipes(int **p, int procs); +int create_daemons(int **p, pid_t *pids, int procs, int process_size); +void initialize_overhead(iter_t iterations, void* cookie); +void cleanup_overhead(iter_t iterations, void* cookie); +void benchmark_overhead(iter_t iterations, void* cookie); +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); + +struct _state { + int process_size; + double overhead; + int procs; + pid_t* pids; + int **p; + void* data; +}; + +int +main(int ac, char **av) +{ + int i, maxprocs; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-s kbytes] processes [processes ...]\n"; + double time; + + /* + * Need 4 byte ints. + */ + if (sizeof(int) != 4) { + fprintf(stderr, "Fix sumit() in ctx.c.\n"); + exit(1); + } + + state.process_size = 0; + state.overhead = 0.0; + state.pids = NULL; + + /* + * If they specified a context size, or parallelism level, get them. 
+ */ + while (( c = getopt(ac, av, "s:P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 's': + state.process_size = atoi(optarg) * 1024; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind > ac - 1) + lmbench_usage(ac, av, usage); + + /* compute pipe + sumit overhead */ + maxprocs = atoi(av[optind]); + for (i = optind; i < ac; ++i) { + state.procs = atoi(av[i]); + if (state.procs > maxprocs) + maxprocs = state.procs; + } + state.procs = maxprocs; + benchmp(initialize_overhead, benchmark_overhead, cleanup_overhead, + 0, 1, warmup, repetitions, &state); + if (gettime() == 0) return(0); + state.overhead = gettime(); + state.overhead /= get_n(); + fprintf(stderr, "\n\"size=%dk ovr=%.2f\n", + state.process_size/1024, state.overhead); + + /* compute the context switch cost for N processes */ + for (i = optind; i < ac; ++i) { + state.procs = atoi(av[i]); + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + + time = gettime(); + time /= get_n(); + time /= state.procs; + time -= state.overhead; + + if (time > 0.0) + fprintf(stderr, "%d %.2f\n", state.procs, time); + } + + return (0); +} + +void +initialize_overhead(iter_t iterations, void* cookie) +{ + int i; + int procs; + int* p; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + pState->pids = NULL; + pState->p = (int**)malloc(pState->procs * (sizeof(int*) + 2 * sizeof(int))); + p = (int*)&pState->p[pState->procs]; + for (i = 0; i < pState->procs; ++i) { + pState->p[i] = p; + p += 2; + } + + pState->data = (pState->process_size > 0) ? 
malloc(pState->process_size) : NULL; + if (pState->data) + bzero(pState->data, pState->process_size); + + procs = create_pipes(pState->p, pState->procs); + if (procs < pState->procs) { + cleanup_overhead(0, cookie); + exit(1); + } +} + +void +cleanup_overhead(iter_t iterations, void* cookie) +{ + int i; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + for (i = 0; i < pState->procs; ++i) { + close(pState->p[i][0]); + close(pState->p[i][1]); + } + + free(pState->p); + if (pState->data) free(pState->data); +} + +void +benchmark_overhead(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + int i = 0; + int msg = 1; + + while (iterations-- > 0) { + if (write(pState->p[i][1], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (read(pState->p[i][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (++i == pState->procs) { + i = 0; + } + bread(pState->data, pState->process_size); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + int procs; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + initialize_overhead(iterations, cookie); + + pState->pids = (pid_t*)malloc(pState->procs * sizeof(pid_t)); + if (pState->pids == NULL) + exit(1); + bzero((void*)pState->pids, pState->procs * sizeof(pid_t)); + procs = create_daemons(pState->p, pState->pids, + pState->procs, pState->process_size); + if (procs < pState->procs) { + cleanup(0, cookie); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + int i; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + /* + * Close the pipes and kill the children. 
+ */ + cleanup_overhead(iterations, cookie); + for (i = 1; pState->pids && i < pState->procs; ++i) { + if (pState->pids[i] > 0) { + kill(pState->pids[i], SIGKILL); + waitpid(pState->pids[i], NULL, 0); + } + } + if (pState->pids) + free(pState->pids); + pState->pids = NULL; +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + int msg; + + /* + * Main process - all others should be ready to roll, time the + * loop. + */ + while (iterations-- > 0) { + if (write(pState->p[0][1], &msg, sizeof(msg)) != + sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (read(pState->p[pState->procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + bread(pState->data, pState->process_size); + } +} + + +void +doit(int rd, int wr, int process_size) +{ + int msg; + void* data = NULL; + + if (process_size) { + data = malloc(process_size); + if (data) bzero(data, process_size); + } + for ( ;; ) { + if (read(rd, &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + break; + } + bread(data, process_size); + if (write(wr, &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + break; + } + } + exit(1); +} + + +int +create_daemons(int **p, pid_t *pids, int procs, int process_size) +{ + int i, j; + int msg; + + /* + * Use the pipes as a ring, and fork off a bunch of processes + * to pass the byte through their part of the ring. + * + * Do the sum in each process and get that time before moving on. + */ + handle_scheduler(benchmp_childid(), 0, procs-1); + for (i = 1; i < procs; ++i) { + switch (pids[i] = fork()) { + case -1: /* could not fork, out of processes? 
*/ + return i; + + case 0: /* child */ + handle_scheduler(benchmp_childid(), i, procs-1); + for (j = 0; j < procs; ++j) { + if (j != i - 1) close(p[j][0]); + if (j != i) close(p[j][1]); + } + doit(p[i-1][0], p[i][1], process_size); + /* NOTREACHED */ + + default: /* parent */ + ; + } + } + + /* + * Go once around the loop to make sure that everyone is ready and + * to get the token in the pipeline. + */ + if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg) || + read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("write/read/write on pipe"); */ + exit(1); + } + return procs; +} + +int +create_pipes(int **p, int procs) +{ + int i; + /* + * Get a bunch of pipes. + */ + morefds(); + for (i = 0; i < procs; ++i) { + if (pipe(p[i]) == -1) { + return i; + } + } + return procs; +} diff --git a/performance/lmbench3/src/lat_dram_page.c b/performance/lmbench3/src/lat_dram_page.c new file mode 100644 index 0000000..250af78 --- /dev/null +++ b/performance/lmbench3/src/lat_dram_page.c @@ -0,0 +1,201 @@ +/* + * lat_dram_page.c - guess the DRAM page latency + * + * usage: lat_dram_page + * + * Copyright (c) 2002 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void dram_page_initialize(iter_t iterations, void* cookie); +void benchmark_loads(iter_t iterations, void *cookie); +double loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie); + +struct dram_page_state +{ + struct mem_state mstate; + int group; +}; + +int +main(int ac, char **av) +{ + int i, j, l; + int verbose = 0; + int maxlen = 64 * 1024 * 1024; + int warmup = 0; + int repetitions = TRIES; + int c; + struct dram_page_state state; + double dram_hit, dram_miss; + char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; + + state.mstate.width = 1; + state.mstate.line = sizeof(char*); + state.mstate.pagesize = getpagesize(); + state.group = 16; + + while (( c = getopt(ac, av, "avL:T:M:W:N:")) != EOF) { + switch(c) { + case 'v': + verbose = 1; + break; + case 'L': + state.mstate.line = bytes(optarg); + break; + case 'T': + state.group = bytes(optarg); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + dram_hit = loads(mem_initialize, maxlen, warmup, repetitions, &state); + dram_miss = loads(dram_page_initialize, maxlen, warmup, repetitions, &state); + + if (dram_hit < 0.95 * dram_miss) { + fprintf(stderr, "%f\n", dram_miss - dram_hit); + } else { + fprintf(stderr, "0.0\n"); + } + + return (0); +} + +#define ONE p = (char **)*p; +#define FIVE ONE ONE ONE ONE ONE +#define TEN FIVE FIVE +#define FIFTY TEN TEN TEN TEN TEN +#define HUNDRED FIFTY FIFTY + +void +benchmark_loads(iter_t iterations, void *cookie) +{ + struct mem_state* state = (struct mem_state*)cookie; + register char **p = (char**)state->base; + register int i; + register int count = state->len / (state->line * 100) + 1; + + while (iterations-- > 0) { + for (i = 0; i < count; ++i) { + HUNDRED; + } + } + + use_pointer((void *)p); +} + +void 
+regroup(size_t* pages, int groupsize, void* cookie) +{ + register int i, j; + register char* ptr; + register char *page; + register char *page_end; + register char *p = 0 /* lint */; + struct mem_state* state = (struct mem_state*)cookie; + + if (groupsize <= 1) return; + + p = state->base; + + /* + * for all but the last page in the group, + * point to the same line in the next page + */ + for (i = 0; i < groupsize - 1; ++i) { + for (j = 0; j < state->pagesize; j += sizeof(char*)) { + *(char**)(p + pages[i] + j) = p + pages[i+1] + j; + } + } + + /* + * for the last page, point to the next line + * in the first page of the group, except for + * the last line in the page which points to + * the first line in the next group + * + * since the pointers are all set up for the + * last line, only modify the pointers for + * the other lines + */ + page = p + pages[groupsize-1]; + page_end = page + state->pagesize; + for (i = 0; i < state->pagesize; i += sizeof(char*)) { + ptr = *(char**)(page + i); + if (page <= ptr && ptr < page_end) { + int offset = (int)(ptr - page); + *(char**)(page + i) = p + pages[0] + offset; + } + } +} + +/* + * This is like mem_initialize + */ +void +dram_page_initialize(iter_t iterations, void* cookie) +{ + int i; + struct mem_state* state = (struct mem_state*)cookie; + struct dram_page_state* dstate = (struct dram_page_state*)cookie; + + if (iterations) return; + + mem_initialize(iterations, cookie); + + for (i = 0; i < state->npages; i += dstate->group) { + int groupsize = dstate->group; + if (groupsize > state->npages - i) { + groupsize = state->npages - i; + } + regroup(state->pages + i, groupsize, cookie); + } + + benchmark_loads(1, cookie); +} + +double +loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie) +{ + double result; + int count; + int parallel = 1; + struct mem_state* state = (struct mem_state*)cookie; + + state->len = len; + state->maxlen = len; + count = 100 * (state->len / (state->line * 100) + 1); + 
+ /* + * Now walk them and time it. + */ + benchmp(initialize, benchmark_loads, mem_cleanup, + 0, parallel, warmup, repetitions, cookie); + + /* We want to get to nanoseconds / load. */ + result = (1000. * (double)gettime()) / (double)(count * get_n()); + /* + fprintf(stderr, "%.5f %.3f\n", len / (1024. * 1024.), result); + /**/ + + return result; +} diff --git a/performance/lmbench3/src/lat_fcntl.c b/performance/lmbench3/src/lat_fcntl.c new file mode 100644 index 0000000..bfe9e7f --- /dev/null +++ b/performance/lmbench3/src/lat_fcntl.c @@ -0,0 +1,224 @@ +#include "bench.h" + +/* + * lat_fcntl.c - file locking test + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id: lat_pipe.c,v 1.8 1997/06/16 05:38:58 lm Exp $\n"; + +#include "bench.h" + +struct flock lock, unlock; +struct flock s1, s2; + +/* + * Create two files, use them as a ping pong test. 
+ * Process A: + * lock(1) + * unlock(2) + * Process B: + * unlock(1) + * lock(2) + * Initial state: + * lock is locked + * lock2 is locked + */ + +#define waiton(fd) fcntl(fd, F_SETLKW, &lock) +#define release(fd) fcntl(fd, F_SETLK, &unlock) + +struct _state { + char filename1[2048]; + char filename2[2048]; + int pid; + int fd1; + int fd2; +}; + +void initialize(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +void +procA(struct _state *state) +{ + if (waiton(state->fd1) == -1) { + perror("lock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd2) == -1) { + perror("unlock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd1) == -1) { + perror("unlock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } +} + +void +procB(struct _state *state) +{ + if (release(state->fd1) == -1) { + perror("unlock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd2) == -1) { + perror("unlock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd1) == -1) { + perror("lock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + char buf[10000]; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + sprintf(state->filename1, "/tmp/lmbench-fcntl%d.1", getpid()); + sprintf(state->filename2, "/tmp/lmbench-fcntl%d.2", getpid()); + state->pid = 0; + state->fd1 = -1; + state->fd2 = -1; + + unlink(state->filename1); + unlink(state->filename2); + if ((state->fd1 = open(state->filename1, O_CREAT|O_RDWR, 0666)) == -1) { + perror("create"); + exit(1); + } + if ((state->fd2 = open(state->filename2, O_CREAT|O_RDWR, 
0666)) == -1) { + perror("create"); + exit(1); + } + unlink(state->filename1); + unlink(state->filename2); + write(state->fd1, buf, sizeof(buf)); + write(state->fd2, buf, sizeof(buf)); + lock.l_type = F_WRLCK; + lock.l_whence = 0; + lock.l_start = 0; + lock.l_len = 1; + unlock = lock; + unlock.l_type = F_UNLCK; + if (waiton(state->fd1) == -1) { + perror("lock1"); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock2"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case -1: + perror("fork"); + exit(1); + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + for ( ;; ) { + procB(state); + } + exit(0); + default: + break; + } +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + procA(state); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + int i; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + if (state->fd1 >= 0) close(state->fd1); + if (state->fd2 >= 0) close(state->fd2); + state->fd1 = -1; + state->fd2 = -1; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + /* + * If they specified a parallelism level, get it. 
+ */ + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + state.pid = 0; + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + micro("Fcntl lock latency", 2 * get_n()); + + return (0); +} diff --git a/performance/lmbench3/src/lat_fifo.c b/performance/lmbench3/src/lat_fifo.c new file mode 100644 index 0000000..e3f69c4 --- /dev/null +++ b/performance/lmbench3/src/lat_fifo.c @@ -0,0 +1,165 @@ +/* + * lat_fifo.c - named pipe transaction test + * + * usage: lat_fifo [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define F1 "/tmp/lmbench_f1.%d" +#define F2 "/tmp/lmbench_f2.%d" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int wr, int rd); + +typedef struct _state { + char filename1[256]; + char filename2[256]; + int pid; + int wr; + int rd; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Fifo latency", get_n()); + return (0); +} + +void +initialize(iter_t iterations, void *cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + state->pid = 0; + sprintf(state->filename1,F1,getpid()); + sprintf(state->filename2,F2,getpid()); + + unlink(state->filename1); unlink(state->filename2); + if (mknod(state->filename1, S_IFIFO|0664, 0) || + mknod(state->filename2, S_IFIFO|0664, 0)) { + perror("mknod"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + state->rd = open(state->filename1, O_RDONLY); + state->wr = open(state->filename2, O_WRONLY); + writer(state->wr, state->rd); + return; + + case -1: + perror("fork"); + return; + + default: + state->wr = open(state->filename1, O_WRONLY); + state->rd = open(state->filename2, O_RDONLY); + break; + 
} + + /* + * One time around to make sure both processes are started. + */ + if (write(state->wr, &c, 1) != 1 || read(state->rd, &c, 1) != 1) { + perror("(i) read/write on pipe"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void * cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + unlink(state->filename1); + unlink(state->filename2); + close(state->wr); + close(state->rd); + + if (state->pid > 0) { + kill(state->pid, 15); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char c; + register int w = state->wr; + register int r = state->rd; + register char *cptr = &c; + + while (iterations-- > 0) { + if (write(w, cptr, 1) != 1 || + read(r, cptr, 1) != 1) { + perror("(r) read/write on pipe"); + exit(1); + } + } +} + +void +writer(register int w, register int r) +{ + char c; + register char *cptr = &c; + + for ( ;; ) { + if (read(r, cptr, 1) != 1 || + write(w, cptr, 1) != 1) { + perror("(w) read/write on pipe"); + } + } +} diff --git a/performance/lmbench3/src/lat_fs.c b/performance/lmbench3/src/lat_fs.c new file mode 100644 index 0000000..0dfafb9 --- /dev/null +++ b/performance/lmbench3/src/lat_fs.c @@ -0,0 +1,272 @@ +/* + * Benchmark creates & deletes. 
+ */ + +char *id = "$Id$\n"; + +#include "bench.h" + + +struct _state { + char *tmpdir; + int max; + int n; + char** names; + int ndirs; + char** dirs; + size_t size; +}; +void measure(size_t size, + int parallel, int warmup, int repetitions, void* cookie); +void mkfile(char* s, size_t size); +void setup_names(iter_t iterations, void* cookie); +void cleanup_names(iter_t iterations, void* cookie); +void setup_rm(iter_t iterations, void* cookie); +void cleanup_mk(iter_t iterations, void* cookie); +void benchmark_mk(iter_t iterations, void* cookie); +void benchmark_rm(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int i; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + static int sizes[] = { 0, 1024, 4096, 10*1024 }; + struct _state state; + int c; + char* usage = "[-s <file size>] [-n <max files per dir>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [<dir>]\n"; + + state.size = 0; + state.max = 100; + state.tmpdir = NULL; + + while (( c = getopt(ac, av, "s:n:P:W:N:")) != EOF) { + switch(c) { + case 's': + state.size = bytes(optarg); + break; + case 'n': + state.max = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac - 1) { + lmbench_usage(ac, av, usage); + } + if (optind == ac - 1) { + state.tmpdir = av[1]; + } + + if (state.size) { + measure(state.size, parallel, warmup, repetitions, &state); + } else { + for (i = 0; i < sizeof(sizes)/sizeof(int); ++i) { + state.size = sizes[i]; + measure(state.size, + parallel, warmup, repetitions, &state); + } + } + return(0); +} + +void +measure(size_t size, int parallel, int warmup, int repetitions, void* cookie) +{ + fprintf(stderr, "%luk", size>>10); + benchmp(setup_names, benchmark_mk, cleanup_mk, 0, parallel, + warmup, repetitions, 
cookie); + if (gettime()) { + fprintf(stderr, "\t%lu\t%.0f", (unsigned long)get_n(), + (double)(1000000. * get_n() / (double)gettime())); + } else { + fprintf(stderr, "\t-1\t-1"); + } + + benchmp(setup_rm, benchmark_rm, cleanup_names, 0, parallel, + warmup, repetitions, cookie); + if (gettime()) { + fprintf(stderr, "\t%.0f", + (double)(1000000. * get_n() / (double)gettime())); + } else { + fprintf(stderr, "\t-1"); + } + fprintf(stderr, "\n"); +} + +void +mkfile(char *name, size_t size) +{ + size_t chunk; + int fd = creat(name, 0666); + char buf[128*1024]; /* XXX - track sizes */ + + while (size > 0) { + chunk = ((size > (128*1024)) ? (128*1024) : size); + write(fd, buf, chunk); + size -= chunk; + } + close(fd); +} + +void +setup_names_recurse(iter_t* foff, iter_t* doff, int depth, struct _state* state) +{ + long i, ndirs, count; + char* basename = state->dirs[*doff]; + char name[L_tmpnam + 8192]; + + if (depth > 0) { + for (count = state->max, i = 1; i < depth; ++i) { + count *= state->max; + } + ndirs = (state->n - *foff) / count + 1; + for (i = 0; i < state->max && i < ndirs && *foff < state->n; ++i) { + sprintf(name, "%s/%ld", basename, i); + state->dirs[++(*doff)] = strdup(name); + mkdir(name, 0777); + setup_names_recurse(foff, doff, depth-1, state); + } + } else { + for (i = 0; i < state->max && *foff < state->n; ++i) { + sprintf(name, "%s/%ld", basename, i); + state->names[(*foff)++] = strdup(name); + } + } +} + +void +setup_names(iter_t iterations, void* cookie) +{ + long i, ndirs, depth; + iter_t foff; + iter_t doff; + char dirname_tmpl[L_tmpnam + 256]; + char* dirname; + struct _state* state = (struct _state*)cookie; + + if (!iterations) return; + + depth = 0; + state->n = iterations; + state->ndirs = iterations / state->max; + if (iterations % state->max) state->ndirs++; + for (ndirs = state->ndirs; ndirs > 1; ) { + ndirs = ndirs / state->max + ((ndirs % state->max) ? 
1 : 0); + state->ndirs += ndirs; + depth++; + } + + state->names = (char**)malloc(iterations * sizeof(char*)); + for (i = 0; i < iterations; ++i) { + state->names[i] = NULL; + } + + state->dirs = (char**)malloc(state->ndirs * sizeof(char*)); + for (i = 0; i < state->ndirs; ++i) { + state->dirs[i] = NULL; + } + + sprintf(dirname_tmpl, "lat_fs_%d_XXXXXX", getpid()); + dirname = tempnam(state->tmpdir, dirname_tmpl); + if (!dirname) { + perror("tempnam failed"); + exit(1); + } + if (mkdir(dirname, S_IRUSR|S_IWUSR|S_IXUSR)) { + perror("mkdir failed"); + exit(1); + } + state->dirs[0] = dirname; + foff = 0; + doff = 0; + setup_names_recurse(&foff, &doff, depth, state); + if (foff != iterations || doff != state->ndirs - 1) { + fprintf(stderr, "setup_names: ERROR: foff=%lu, iterations=%lu, doff=%lu, ndirs=%lu, depth=%d\n", (unsigned long)foff, (unsigned long)iterations, (unsigned long)doff, (unsigned long)state->ndirs, depth); + } +} + +void +cleanup_names(iter_t iterations, void* cookie) +{ + long i; + struct _state* state = (struct _state*)cookie; + + if (!iterations) return; + + for (i = 0; i < state->n; ++i) { + if (state->names[i]) free(state->names[i]); + } + free(state->names); + state->n = 0; + + for (i = state->ndirs - 1; i >= 0; --i) { + if (state->dirs[i]) { + rmdir(state->dirs[i]); + free(state->dirs[i]); + } + } + free(state->dirs); + state->ndirs = 0; +} + +void +setup_rm(iter_t iterations, void* cookie) +{ + if (!iterations) return; + + setup_names(iterations, cookie); + benchmark_mk(iterations, cookie); +} + +void +cleanup_mk(iter_t iterations, void* cookie) +{ + if (!iterations) return; + + benchmark_rm(iterations, cookie); + cleanup_names(iterations, cookie); +} + +void +benchmark_mk(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + if (!state->names[iterations]) { + fprintf(stderr, "benchmark_mk: null filename at %lu of %lu\n", iterations, state->n); + continue; + } + 
mkfile(state->names[iterations], state->size); + } +} + +void +benchmark_rm(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + if (!state->names[iterations]) { + fprintf(stderr, "benchmark_rm: null filename at %lu of %lu\n", iterations, state->n); + continue; + } + unlink(state->names[iterations]); + } +} + diff --git a/performance/lmbench3/src/lat_http.c b/performance/lmbench3/src/lat_http.c new file mode 100644 index 0000000..77e6f38 --- /dev/null +++ b/performance/lmbench3/src/lat_http.c @@ -0,0 +1,128 @@ +/* + * lat_http.c - simple HTTP transaction latency test + * + * usage: lat_http hostname [port] < filelist + * + * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +char *buf; +int debug; +int echo; + +int +http(char *server, char *file, int prog) +{ + int sock; + int n; + int b = 0; + + sock = tcp_connect(server, prog, SOCKOPT_REUSE); + sprintf(buf, "GET /%s HTTP/1.0\r\n\r\n\n", file); + if (debug) { + printf(buf); + } + write(sock, buf, strlen(buf)); + while ((n = read(sock, buf, XFERSIZE)) > 0) { + b += n; + if (echo) { + write(1, buf, n); + } + } + close(sock); + if (debug) { + printf("Got %d\n", b); + } + return (b); +} + +void +killhttp(char *server, int prog) +{ + int sock; + + sock = tcp_connect(server, prog, SOCKOPT_REUSE); + write(sock, "EXIT", 4); + close(sock); +} + +void chop(register char *s) { while (*s && *s != '\n') s++; *s = 0; } + +int +main(int ac, char **av) +{ + char *server; + int i, prog; + int c; + int shutdown = 0; + uint64 total = 0; + uint64 usecs = 0; + double avg; + char *name = av[0]; + char file[1024]; + char *usage = "[-d] [-e] [-S] serverhost [port] < list\n"; + + while (( c = getopt(ac, av, "deS")) != EOF) { + switch(c) { + case 'd': + debug++; + break; + case 'e': + echo++; + break; + case 'S': /* shutdown serverhost */ + shutdown = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind >= ac || optind < ac - 2) { + lmbench_usage(ac, av, usage); + exit(0); + } + server = av[optind++]; + + if (optind < ac && atoi(av[optind]) != 0) { + prog = -atoi(av[optind]); + } else { + prog = -80; + } + + if (shutdown) { + killhttp(server, prog); + exit(0); + } + + i = 0; + buf = valloc(XFERSIZE); + bzero(buf, XFERSIZE); + while (fgets(file, sizeof(file), stdin)) { + chop(file); + start(0); + total += http(server, file, prog); + usecs += stop(0,0); + i++; + } + avg = total; + avg /= (i - 1); + if (avg > 1000) { + avg /= 1000; + fprintf(stderr, "Avg xfer: %.1fKB, ", avg); + } else { + fprintf(stderr, "Avg xfer %d, ", (int)avg); + } + settime(usecs); + latency((uint64)1, total); + exit(0); +} + diff --git 
a/performance/lmbench3/src/lat_mem_rd.c b/performance/lmbench3/src/lat_mem_rd.c new file mode 100644 index 0000000..e56e458 --- /dev/null +++ b/performance/lmbench3/src/lat_mem_rd.c @@ -0,0 +1,169 @@ +/* + * lat_mem_rd.c - measure memory load latency + * + * usage: lat_mem_rd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] size-in-MB [stride ...] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2003, 2004 Carl Staelin. + * + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id: s.lat_mem_rd.c 1.13 98/06/30 16:13:49-07:00 lm@xxxxxxxxxxxxxxx $\n"; + +#include "bench.h" +#define STRIDE (512/sizeof(char *)) +#define LOWER 512 +void loads(size_t len, size_t range, size_t stride, + int parallel, int warmup, int repetitions); +size_t step(size_t k); +void initialize(iter_t iterations, void* cookie); + +benchmp_f fpInit = stride_initialize; + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t len; + size_t range; + size_t stride; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] len [stride...]\n"; + + while (( c = getopt(ac, av, "tP:W:N:")) != EOF) { + switch(c) { + case 't': + fpInit = thrash_initialize; + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind == ac) { + lmbench_usage(ac, av, usage); + } + + len = atoi(av[optind]); + len *= 1024 * 1024; + + if (optind == ac - 1) { + fprintf(stderr, "\"stride=%d\n", STRIDE); + for (range = LOWER; range <= len; range = step(range)) { + 
loads(len, range, STRIDE, parallel, + warmup, repetitions); + } + } else { + for (i = optind + 1; i < ac; ++i) { + stride = bytes(av[i]); + fprintf(stderr, "\"stride=%d\n", stride); + for (range = LOWER; range <= len; range = step(range)) { + loads(len, range, stride, parallel, + warmup, repetitions); + } + fprintf(stderr, "\n"); + } + } + return(0); +} + +#define ONE p = (char **)*p; +#define FIVE ONE ONE ONE ONE ONE +#define TEN FIVE FIVE +#define FIFTY TEN TEN TEN TEN TEN +#define HUNDRED FIFTY FIFTY + + +void +benchmark_loads(iter_t iterations, void *cookie) +{ + struct mem_state* state = (struct mem_state*)cookie; + register char **p = (char**)state->p[0]; + register size_t i; + register size_t count = state->len / (state->line * 100) + 1; + + while (iterations-- > 0) { + for (i = 0; i < count; ++i) { + HUNDRED; + } + } + + use_pointer((void *)p); + state->p[0] = (char*)p; +} + + +void +loads(size_t len, size_t range, size_t stride, + int parallel, int warmup, int repetitions) +{ + double result; + size_t count; + struct mem_state state; + + if (range < stride) return; + + state.width = 1; + state.len = range; + state.maxlen = len; + state.line = stride; + state.pagesize = getpagesize(); + count = 100 * (state.len / (state.line * 100) + 1); + +#if 0 + (*fpInit)(0, &state); + fprintf(stderr, "loads: after init\n"); + (*benchmark_loads)(2, &state); + fprintf(stderr, "loads: after benchmark\n"); + mem_cleanup(0, &state); + fprintf(stderr, "loads: after cleanup\n"); + settime(1); + save_n(1); +#else + /* + * Now walk them and time it. + */ + benchmp(fpInit, benchmark_loads, mem_cleanup, + 100000, parallel, warmup, repetitions, &state); +#endif + + /* We want to get to nanoseconds / load. */ + save_minimum(); + result = (1000. * (double)gettime()) / (double)(count * get_n()); + fprintf(stderr, "%.5f %.3f\n", range / (1024. 
* 1024.), result); + +} + +size_t +step(size_t k) +{ + if (k < 1024) { + k = k * 2; + } else if (k < 4*1024) { + k += 1024; + } else { + size_t s; + + for (s = 32 * 1024; s <= k; s *= 2) + ; + k += s / 16; + } + return (k); +} diff --git a/performance/lmbench3/src/lat_mmap.c b/performance/lmbench3/src/lat_mmap.c new file mode 100644 index 0000000..1a6445b --- /dev/null +++ b/performance/lmbench3/src/lat_mmap.c @@ -0,0 +1,175 @@ +/* + * lat_mmap.c - time how fast a mapping can be made and broken down + * + * Usage: mmap [-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file + * + * XXX - If an implementation did lazy address space mapping, this test + * will make that system look very good. I haven't heard of such a system. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define PSIZE (16<<10) +#define N 10 +#define STRIDE (10*PSIZE) +#define MINSIZE (STRIDE*2) + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + + +typedef struct _state { + size_t size; + int fd; + int random; + int clone; + char *name; +} state_t; + +void init(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void domapping(iter_t iterations, void * cookie); + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char buf[256]; + int c; + char *usage = "[-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file\n"; + + + state.random = 0; + state.clone = 0; + while (( c = getopt(ac, av, "rP:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'r': + state.random = 1; + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 2 != ac) { + lmbench_usage(ac, av, usage); + } + + state.size = bytes(av[optind]); + if (state.size < MINSIZE) { + return (1); + } + state.name = av[optind+1]; + + benchmp(init, domapping, cleanup, 0, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + micromb(state.size, get_n()); + } + return (0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->name) + strlen(buf) + 1); + sprintf(s, "%s%d", state->name, (int)getpid()); + if (cp(state->name, s, S_IREAD|S_IWRITE) < 0) { + perror("Could not copy file"); + unlink(s); + exit(1); + } + state->name = s; + } + 
CHK(state->fd = open(state->name, O_RDWR)); + if (state->clone) unlink(state->name); + if (lseek(state->fd, 0, SEEK_END) < state->size) { + fprintf(stderr, "Input file too small\n"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->fd); +} + +/* + * This alg due to Linus. The goal is to have both sparse and full + * mappings reported. + */ +void +domapping(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register int fd = state->fd; + register size_t size = state->size; + register int random = state->random; + register char *p, *where, *end; + register char c = size & 0xff; + + while (iterations-- > 0) { + +#ifdef MAP_FILE + where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_SHARED, fd, 0); +#else + where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +#endif + if ((long)where == -1) { + perror("mmap"); + exit(1); + } + if (random) { + end = where + size; + for (p = where; p < end; p += STRIDE) { + *p = c; + } + } else { + end = where + (size / N); + for (p = where; p < end; p += PSIZE) { + *p = c; + } + } + munmap(where, size); + } +} diff --git a/performance/lmbench3/src/lat_ops.c b/performance/lmbench3/src/lat_ops.c new file mode 100755 index 0000000..a86b449 --- /dev/null +++ b/performance/lmbench3/src/lat_ops.c @@ -0,0 +1,485 @@ +/* + * lat_ops.c - benchmark of simple operations + * + * Copyright (c) 1996-2004 Carl Staelin and Larry McVoy. + * + * This benchmark is meant to benchmark raw arithmetic operation + * latency for various operations on various datatypes. Obviously, + * not all operations make sense for all datatypes (e.g., modulus + * on float). The benchmarks are configured to use interlocking + * operations, so we measure the time of an individual operation. 
+ * + * The exception to the interlocking operation guidelines are the + * vector operations, muladd and bogomflops, for both float and + * double data types. In this case we are trying to determine + * how well the CPU can schedule the various arithmetic units + * and overlap adjacent operations to get the maximal throughput + * from the system. In addition, we are using relatively short + * vectors so these operations should be going to/from L1 (or + * possibly L2) cache, rather than main memory, which should + * reduce or eliminate the memory overheads. + * + * The vector operations use a slightly unrolled loop because + * this is common in scientific codes that do these sorts of + * operations. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + int N; + int M; + int K; + double* data; +}; + +#define FIVE(a) a a a a a +#define TEN(a) a a a a a a a a a a +#define HUNDRED(a) TEN(TEN(a)) + +void +float_initialize(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register float* x; + + if (iterations) return; + + x = (float*)malloc(pState->M * sizeof(float)); + pState->data = (double*)x; + for (i = 0; i < pState->M; ++i) { + x[i] = 3.14159265; + } +} + +void +double_initialize(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + + if (iterations) return; + + pState->data = (double*)malloc(pState->M * sizeof(double)); + for (i = 0; i < pState->M; ++i) { + pState->data[i] = 3.14159265; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + + if (iterations) return; + + if (pState->data) + free(pState->data); +} + +void +do_integer_bitwise(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N; + register int s = (int)iterations; + + while (iterations-- > 0) { + HUNDRED(r ^= iterations; s ^= r; r |= s;) + } + use_int(r); +} + 
+void +do_integer_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int a = pState->N + 57; + register int b = pState->N + 31; + + while (iterations-- > 0) { + HUNDRED(a += b; b -= a;) + } + use_int(a+b); +} + +void +do_integer_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + 37431; + register int s = pState->N + 4; + register int t = r * s * s * s * s * s * s * s * s * s * s - r; + + while (iterations-- > 0) { + TEN(r *= s;); r -= t; + TEN(r *= s;); r -= t; + } + use_int(r); +} + +void +do_integer_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + 36; + register int s = (r + 1) << 20; + + while (iterations-- > 0) { + HUNDRED(r = s / r;) + } + use_int(r); +} + +void +do_integer_mod(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + iterations; + register int s = pState->N + 62; + + while (iterations-- > 0) { + HUNDRED(r %= s; r |= s;) + } + use_int(r); +} + +void +do_int64_bitwise(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = (int64)pState->N | (int64)pState->N<<32; + register int64 s = (int64)iterations | (int64)iterations<<32; + register int64 i = (int64)iterations<<34 - 1; + + while (iterations-- > 0) { + HUNDRED(r ^= i; s ^= r; r |= s;) + i--; + } + use_int((int)r); +} + +void +do_int64_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 a = (int64)pState->N + 37420; + register int64 b = (int64)pState->N + 21698324; + + a += (int64)(0xFE + pState->N)<<30; + b += (int64)(0xFFFE + pState->N)<<29; + + while (iterations-- > 0) { + HUNDRED(a += b; b -= a;) + } + use_int((int)a+(int)b); +} + +void +do_int64_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + 
register int64 r = (int64)pState->N + 37420; + register int64 s = (int64)pState->N + 4; + register int64 t; + + r += (int64)(pState->N + 6)<<32; + t = r * s * s * s * s * s * s * s * s * s * s - r; + + while (iterations-- > 0) { + TEN(r *= s;); r -= t; + TEN(r *= s;); r -= t; + } + use_int((int)r); +} + +void +do_int64_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = (int64)pState->N + 36; + register int64 s; + + r += r << 33; + s = (r + 17) << 13; + + while (iterations-- > 0) { + HUNDRED(r = s / r;) + } + use_int((int)r); +} + +void +do_int64_mod(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = iterations + (int64)iterations<<32; + register int64 s = (int64)pState->N + (int64)pState->N<<56; + + while (iterations-- > 0) { + HUNDRED(r %= s; r |= s;); + } + use_int((int)r); +} + +void +do_float_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = (float)pState->N; + register float g = (float)pState->K; + + while (iterations-- > 0) { + TEN(f += (float)f;) f += (float)g; + TEN(f += (float)f;) f += (float)g; + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = 8.0f * (float)pState->N; + register float g = 0.125f * (float)pState->M / 1000.0; + + while (iterations-- > 0) { + TEN(f *= f; f *= g;); + TEN(f *= f; f *= g;); + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = 1.41421356f * (float)pState->N; + register float g = 3.14159265f * (float)pState->M / 1000.0; + + while (iterations-- > 0) { + FIVE(TEN(f = g / f;) TEN(g = f / g;)) + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_add(iter_t iterations, void* cookie) +{ + struct _state *pState 
= (struct _state*)cookie; + register double f = (double)pState->N; + register double g = (double)pState->K; + + while (iterations-- > 0) { + TEN(f += (double)f;) f += (double)g; + TEN(f += (double)f;) f += (double)g; + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register double f = 8.0 * (double)pState->N; + register double g = 0.125 * (double)pState->M / 1000.0; + + while (iterations-- > 0) { + TEN(f *= f; f *= g;) + TEN(f *= f; f *= g;) + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register double f = 1.41421356 * (double)pState->N; + register double g = 3.14159265 * (double)pState->M / 1000.0; + + while (iterations-- > 0) { + FIVE(TEN(f = g / f;) TEN(g = f / g;)) + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_bogomflops(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register int M = pState->M / 10; + + while (iterations-- > 0) { + register float *x = (float*)pState->data; + for (i = 0; i < M; ++i) { + x[0] = (1.0f + x[0]) * (1.5f - x[0]) / x[0]; + x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; + x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; + x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; + x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; + x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; + x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; + x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; + x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; + x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; + x += 10; + } + } +} + +void +do_double_bogomflops(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register int M = pState->M / 10; + + while (iterations-- > 0) { + register double *x = (double*)pState->data; + for (i = 0; i < M; ++i) { + x[0] = (1.0f + x[0]) 
* (1.5f - x[0]) / x[0]; + x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; + x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; + x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; + x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; + x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; + x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; + x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; + x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; + x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; + x += 10; + } + } +} + +int +main(int ac, char **av) +{ + int __n = 1; + int c, i, j; + int warmup = 0; + int parallel = 1; + int repetitions = TRIES; + uint64 iop_time; + uint64 iop_N; + struct _state state; + char *usage = "[-W <warmup>] [-N <repetitions>] [-P <parallel>] \n"; + + state.N = 1; + state.M = 1000; + state.K = -1023; + state.data = NULL; + + while (( c = getopt(ac, av, "W:N:P:")) != EOF) { + switch(c) { + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + benchmp(NULL, do_integer_bitwise, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer bit", get_n() * 100 * 3); + + benchmp(NULL, do_integer_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer add", get_n() * 100 * 2); + iop_time = gettime(); + iop_N = get_n() * 100 * 2; + + benchmp(NULL, do_integer_mul, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 2 * iop_time) / iop_N); + nano("integer mul", get_n() * 10 * 2); + + benchmp(NULL, do_integer_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer div", get_n() * 100); + + benchmp(NULL, do_integer_mod, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 100 * iop_time) / iop_N); + nano("integer mod", get_n() * 100); + + benchmp(NULL, do_int64_bitwise, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 bit", get_n() * 
100 * 3); + iop_time = gettime(); + iop_N = get_n() * 100 * 3; + + benchmp(NULL, do_int64_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 add", get_n() * 100 * 2); + + benchmp(NULL, do_int64_mul, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 2 * iop_time) / iop_N); + nano("int64 mul", get_n() * 10 * 2); + + benchmp(NULL, do_int64_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 div", get_n() * 100); + + benchmp(NULL, do_int64_mod, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 100 * iop_time) / iop_N); + nano("int64 mod", get_n() * 100); + + benchmp(NULL, do_float_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("float add", get_n() * (10 + 1) * 2); + + benchmp(NULL, do_float_mul, NULL, + 0, 1, warmup, repetitions, &state); + nano("float mul", get_n() * 10 * 2 * 2); + + benchmp(NULL, do_float_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("float div", get_n() * 100); + + benchmp(NULL, do_double_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("double add", get_n() * (10 + 1) * 2); + + benchmp(NULL, do_double_mul, NULL, + 0, 1, warmup, repetitions, &state); + nano("double mul", get_n() * 10 * 2 * 2); + + benchmp(NULL, do_double_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("double div", get_n() * 100); + + benchmp(float_initialize, do_float_bogomflops, cleanup, + 0, parallel, warmup, repetitions, &state); + nano("float bogomflops", get_n() * state.M); + fflush(stdout); fflush(stderr); + + benchmp(double_initialize, do_double_bogomflops, cleanup, + 0, parallel, warmup, repetitions, &state); + nano("double bogomflops", get_n() * state.M); + fflush(stdout); fflush(stderr); + + return(0); +} + diff --git a/performance/lmbench3/src/lat_pagefault.c b/performance/lmbench3/src/lat_pagefault.c new file mode 100644 index 0000000..02af9f4 --- /dev/null +++ b/performance/lmbench3/src/lat_pagefault.c @@ -0,0 +1,202 @@ +/* + * lat_pagefault.c - time a page 
fault in + * + * Usage: lat_pagefault [-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + +typedef struct _state { + int fd; + int size; + int npages; + int clone; + char* file; + char* where; + size_t* pages; +} state_t; + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void benchmark(iter_t iterations, void * cookie); +void benchmark_mmap(iter_t iterations, void * cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + double t_mmap; + double t_combined; + struct stat st; + struct _state state; + char buf[2048]; + char* usage = "[-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file\n"; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 ) { + lmbench_usage(ac, av, usage); + } + + state.file = av[optind]; + CHK(stat(state.file, &st)); + state.npages = st.st_size / (size_t)getpagesize(); + +#ifdef MS_INVALIDATE + benchmp(initialize, benchmark_mmap, cleanup, 0, parallel, + warmup, repetitions, &state); + t_mmap = gettime() / (double)get_n(); + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, 
&state); + t_combined = gettime() / (double)get_n(); + settime(get_n() * (t_combined - t_mmap)); + + sprintf(buf, "Pagefaults on %s", state.file); + micro(buf, state.npages * get_n()); +#endif + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + int i, npages, pagesize; + int *p; + unsigned int r; + struct stat sbuf; + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->file) + strlen(buf) + 1); + sprintf(s, "%s%d", state->file, (int)getpid()); + if (cp(state->file, s, S_IREAD|S_IWRITE) < 0) { + perror("Could not copy file"); + unlink(s); + exit(1); + } + state->file = s; + } + CHK(state->fd = open(state->file, 0)); + if (state->clone) unlink(state->file); + CHK(fstat(state->fd, &sbuf)); + + srand(getpid()); + pagesize = getpagesize(); + state->size = sbuf.st_size; + state->size -= state->size % pagesize; + state->npages = state->size / pagesize; + state->pages = permutation(state->npages, pagesize); + + if (state->size < 1024*1024) { + fprintf(stderr, "lat_pagefault: %s too small\n", state->file); + exit(1); + } + state->where = mmap(0, state->size, + PROT_READ, MAP_SHARED, state->fd, 0); + +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + munmap(state->where, state->size); + if (state->fd >= 0) close(state->fd); + free(state->pages); +} + +void +benchmark(iter_t iterations, void* cookie) +{ + int i; + int sum = 0; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + for (i = 0; i < state->npages; ++i) { + sum += *(state->where + state->pages[i]); + } + munmap(state->where, state->size); + state->where = mmap(0, state->size, + 
PROT_READ, MAP_SHARED, state->fd, 0); +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif + } + use_int(sum); +} + +void +benchmark_mmap(iter_t iterations, void* cookie) +{ + int i; + int sum = 0; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + munmap(state->where, state->size); + state->where = mmap(0, state->size, + PROT_READ, MAP_SHARED, state->fd, 0); +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif + } + use_int(sum); +} + diff --git a/performance/lmbench3/src/lat_pipe.c b/performance/lmbench3/src/lat_pipe.c new file mode 100644 index 0000000..bdf2a79 --- /dev/null +++ b/performance/lmbench3/src/lat_pipe.c @@ -0,0 +1,155 @@ +/* + * lat_pipe.c - pipe transaction test + * + * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int w, int r); + +typedef struct _state { + int pid; + int p1[2]; + int p2[2]; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Pipe latency", get_n()); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (pipe(state->p1) == -1) { + perror("pipe"); + exit(1); + } + if (pipe(state->p2) == -1) { + perror("pipe"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + signal(SIGTERM, exit); + close(state->p1[1]); + close(state->p2[0]); + writer(state->p2[1], state->p1[0]); + return; + + case -1: + perror("fork"); + return; + + default: + close(state->p1[0]); + close(state->p2[1]); + break; + } + + /* + * One time around to make sure both processes are started. 
+ */ + if (write(state->p1[1], &c, 1) != 1 || read(state->p2[0], &c, 1) != 1){ + perror("(i) read/write on pipe"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char c; + register int w = state->p1[1]; + register int r = state->p2[0]; + register char *cptr = &c; + + while (iterations-- > 0) { + if (write(w, cptr, 1) != 1 || + read(r, cptr, 1) != 1) { + perror("(r) read/write on pipe"); + exit(1); + } + } +} + +void +writer(register int w, register int r) +{ + char c; + register char *cptr = &c; + + for ( ;; ) { + if (read(r, cptr, 1) != 1 || + write(w, cptr, 1) != 1) { + perror("(w) read/write on pipe"); + } + } +} diff --git a/performance/lmbench3/src/lat_pmake.c b/performance/lmbench3/src/lat_pmake.c new file mode 100644 index 0000000..8d898eb --- /dev/null +++ b/performance/lmbench3/src/lat_pmake.c @@ -0,0 +1,158 @@ +/* + * lat_pmake.c - time to complete N jobs which each do usecs worth of work + * + * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] jobs usecs + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void setup(iter_t iterations, void* cookie); +void bench(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void work(iter_t iterations, void *cookie); + +typedef struct _state { + int jobs; /* number of jobs to create */ + iter_t iterations; /* how long each job should work */ + long* x; /* used by work() */ + long** p; + pid_t* pids; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + double time; + uint64 usecs; + char buf[1024]; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] Njobs usecs...\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (ac < optind + 2) { + lmbench_usage(ac, av, usage); + } + state.jobs = atoi(av[optind]); + state.pids = NULL; + fprintf(stderr, "\"pmake jobs=%d\n", state.jobs); + while (++optind < ac) { + usecs = bytes(av[optind]); + benchmp(setup, work, NULL, 0, 1, 0, TRIES, &state); + if (gettime() == 0) exit(1); + state.iterations = (iter_t)((usecs * get_n()) / gettime()); + + benchmp(setup, bench, NULL, 0, parallel, + warmup, repetitions, &state); + time = gettime(); + time /= get_n(); + if (time > 0.0) + fprintf(stderr, "%llu %.2f\n", usecs, time); + } + return (0); +} + +void +setup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->x = (long*)malloc(sizeof(long*)); + *(long**)state->x = state->x; + state->p = (long**)state->x; + + handle_scheduler(benchmp_childid(), 0, state->jobs); +} + +void +bench(register iter_t iterations, void *cookie) +{ + int i; + int status; + state_t *state = (state_t *) cookie; 
+ + state->pids = (pid_t*)malloc(state->jobs * sizeof(pid_t)); + + /* + * This design has one buglet --- we cannot detect if the + * worker process died prematurely. I.e., we don't have + * a handshake step to collect "I finished correctly" + * messages. + */ + while (iterations-- > 0) { + for (i = 0; i < state->jobs; ++i) { + if ((state->pids[i] = fork()) == 0) { + handle_scheduler(benchmp_childid(), i+1, state->jobs); + work(state->iterations, state); + exit(0); + } + } + for (i = 0; i < state->jobs; ++i) { + waitpid(state->pids[i], &status, 0); + state->pids[i] = -1; + + /* child died badly */ + if (!WIFEXITED(status)) { + cleanup(0, cookie); + exit(1); + } + } + } +} + +void +cleanup(register iter_t iterations, void *cookie) +{ + int i; + state_t *state = (state_t *) cookie; + + for (i = 0; i < state->jobs; ++i) { + if (state->pids[i] > 0) { + kill(state->pids[i], SIGKILL); + waitpid(state->pids[i], NULL, 0); + state->pids[i] = -1; + } + } +} + +void +work(register iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + register long** p = state->p; + +#define WORK_TEN(one) one one one one one one one one one one + while (iterations-- > 0) { + WORK_TEN(p = (long**) *p;); + } + state->p = p; +} diff --git a/performance/lmbench3/src/lat_proc.c b/performance/lmbench3/src/lat_proc.c new file mode 100644 index 0000000..e36e19d --- /dev/null +++ b/performance/lmbench3/src/lat_proc.c @@ -0,0 +1,182 @@ +/* + * lat_proc.c - process creation tests + * + * Usage: lat_proc [-P <parallelism] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell + * + * TODO - linux clone, plan9 rfork, IRIX sproc(). + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + + +#ifdef STATIC +#define PROG "/tmp/hello-s" +#define STATIC_PREFIX "Static " +#else +#define PROG "/tmp/hello" +#define STATIC_PREFIX "" +#endif + +void do_shell(iter_t iterations, void* cookie); +void do_forkexec(iter_t iterations,void* cookie); +void do_fork(iter_t iterations, void* cookie); +void do_procedure(iter_t iterations, void* cookie); + +pid_t child_pid; + + +void +cleanup(iter_t iterations, void* cookie) +{ + if (iterations) return; + + if (child_pid) { + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + child_pid = 0; + } +} + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { /* should have one argument left */ + lmbench_usage(ac, av, usage); + } + + if (!strcmp("procedure", av[optind])) { + benchmp(NULL, do_procedure, cleanup, 0, parallel, + warmup, repetitions, &ac); + micro("Procedure call", get_n()); + } else if (!strcmp("fork", av[optind])) { + benchmp(NULL, do_fork, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+exit", get_n()); + } else if (!strcmp("exec", av[optind])) { + benchmp(NULL, do_forkexec, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+execve", get_n()); + } else if (!strcmp("shell", av[optind])) { + benchmp(NULL, do_shell, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+/bin/sh -c", get_n()); + } else { + lmbench_usage(ac, av, usage); + } + return(0); +} + 
+void +do_shell(iter_t iterations, void* cookie) +{ + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + close(1); + execlp("/bin/sh", "sh", "-c", PROG, 0); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_forkexec(iter_t iterations, void* cookie) +{ + char *nav[2]; + + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + nav[0] = PROG; + nav[1] = 0; + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + close(1); + execve(PROG, nav, 0); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_fork(iter_t iterations, void* cookie) +{ + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_procedure(iter_t iterations, void* cookie) +{ + int r = *(int *) cookie; + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + use_int(r); + } +} diff --git a/performance/lmbench3/src/lat_rand.c b/performance/lmbench3/src/lat_rand.c new file mode 100644 index 0000000..42b3aaf --- /dev/null +++ b/performance/lmbench3/src/lat_rand.c @@ -0,0 +1,120 @@ +/* + * lat_rand.c - random number generation + * + * usage: lat_rand [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2002 Carl Staelin. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Hewlett-Packard is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#ifdef HAVE_DRAND48 +void bench_drand48(iter_t iterations, void *cookie); +void bench_lrand48(iter_t iterations, void *cookie); +#endif +#ifdef HAVE_RAND +void bench_rand(iter_t iterations, void *cookie); +#endif +#ifdef HAVE_RANDOM +void bench_random(iter_t iterations, void *cookie); +#endif +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + +#ifdef HAVE_DRAND48 + benchmp(NULL, bench_drand48, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("drand48 latency", get_n()); + + benchmp(NULL, bench_lrand48, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("lrand48 latency", get_n()); +#endif +#ifdef HAVE_RAND + benchmp(NULL, bench_rand, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("rand latency", get_n()); +#endif +#ifdef HAVE_RANDOM + benchmp(NULL, bench_random, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("random latency", get_n()); +#endif + return (0); +} + +#ifdef HAVE_DRAND48 +void +bench_drand48(register iter_t iterations, void *cookie) +{ + register double v = 0.0; + while (iterations-- > 0) { + v += drand48(); + } + use_int((int)v); +} + +void +bench_lrand48(register iter_t iterations, void 
*cookie) +{ + register long v = 0.0; + while (iterations-- > 0) { + v += lrand48(); + } + use_int((int)v); +} +#endif /* HAVE_DRAND48 */ +#ifdef HAVE_RAND +void +bench_rand(register iter_t iterations, void *cookie) +{ + register int v = 0.0; + while (iterations-- > 0) { + v += rand(); + } + use_int((int)v); +} +#endif /* HAVE_RAND */ +#ifdef HAVE_RANDOM +void +bench_random(register iter_t iterations, void *cookie) +{ + register int v = 0.0; + while (iterations-- > 0) { + v += random(); + } + use_int((int)v); +} +#endif /* HAVE_RANDOM */ diff --git a/performance/lmbench3/src/lat_rpc.c b/performance/lmbench3/src/lat_rpc.c new file mode 100644 index 0000000..3ebfb16 --- /dev/null +++ b/performance/lmbench3/src/lat_rpc.c @@ -0,0 +1,285 @@ +/* + * lat_rpc.c - simple RPC transaction latency test + * + * Four programs in one - + * server usage: lat_rpc -s + * client usage: lat_rpc hostname + * client usage: lat_rpc -p tcp hostname + * client usage: lat_rpc -p udp hostname + * shutdown: lat_rpc -S hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +void client_main(int ac, char **av); +void server_main(); +void benchmark(iter_t iterations, void* _state); +char *client_rpc_xact_1(char *argp, CLIENT *clnt); + +void +doit(CLIENT *cl, char *server) +{ + char c = 1; + char *resp; + + resp = client_rpc_xact_1(&c, cl); + if (!resp) { + clnt_perror(cl, server); + exit(1); + } + if (*resp != 123) { + fprintf(stderr, "lat_rpc: got bad data\n"); + exit(1); + } +} + + +/* Default timeout can be changed using clnt_control() */ +static struct timeval TIMEOUT = { 0, 25000 }; + +char *proto[] = { "tcp", "udp", 0 }; + +typedef struct state_ { + int msize; + char *server; + char *protocol; + CLIENT *cl; +} state_t; + +void +initialize(iter_t iterations, void* cookie) +{ + struct timeval tv; + state_t *state = (state_t*)cookie; + + if (iterations) return; + + state->cl = clnt_create(state->server, XACT_PROG, XACT_VERS, + state->protocol); + if (!state->cl) { + clnt_pcreateerror(state->server); + exit(1); + } + if (strcasecmp(state->protocol, proto[1]) == 0) { + tv.tv_sec = 0; + tv.tv_usec = 2500; + if (!clnt_control(state->cl, CLSET_RETRY_TIMEOUT, (char *)&tv)) { + clnt_perror(state->cl, "setting timeout"); + exit(1); + } + } +} + +void +benchmark(iter_t iterations, void* _state) +{ + state_t* state = (state_t*)_state; + char buf[256]; + + while (iterations-- > 0) { + doit(state->cl, state->server); + } +} + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + int shutdown = 0; + state_t state; + CLIENT *cl; + char buf[1024]; + char *protocol = NULL; + char *usage = "-s\n OR [-p <tcp|udp>] [-P parallel] [-W <warmup>] [-N <repetitions>] serverhost\n OR -S serverhost\n"; + + state.msize = 1; + + while (( c = getopt(ac, av, "sS:m:p:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + cl = 
clnt_create(optarg, XACT_PROG, XACT_VERS, "udp"); + if (!cl) { + clnt_pcreateerror(state.server); + exit(1); + } + clnt_call(cl, RPC_EXIT, (xdrproc_t)xdr_void, 0, + (xdrproc_t)xdr_void, 0, TIMEOUT); + exit(0); + } + case 'm': + state.msize = atoi(optarg); + break; + case 'p': + protocol = optarg; + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + + if (protocol == NULL || !strcasecmp(protocol, proto[0])) { + state.protocol = proto[0]; + benchmp(initialize, benchmark, NULL, MEDIUM, parallel, + warmup, repetitions, &state); + sprintf(buf, "RPC/%s latency using %s", proto[0], state.server); + micro(buf, get_n()); + } + + if (protocol == NULL || !strcasecmp(protocol, proto[1])) { + state.protocol = proto[1]; + benchmp(initialize, benchmark, NULL, MEDIUM, parallel, + warmup, repetitions, &state); + sprintf(buf, "RPC/%s latency using %s", proto[1], state.server); + micro(buf, get_n()); + } + + exit(0); +} + +char * +client_rpc_xact_1(char *argp, CLIENT *clnt) +{ + static char res; + + bzero((void*)&res, sizeof(res)); + if (clnt_call(clnt, RPC_XACT, (xdrproc_t)xdr_char, + argp, (xdrproc_t)xdr_char, &res, TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (&res); +} + +/* + * The remote procedure[s] that will be called + */ +/* ARGSUSED */ +char * +rpc_xact_1(msg, transp) + char *msg; + register SVCXPRT *transp; +{ + static char r = 123; + + return &r; +} + +static void xact_prog_1(); + +void +server_main() +{ + register SVCXPRT *transp; + + GO_AWAY; + + (void) pmap_unset(XACT_PROG, XACT_VERS); + + transp = svcudp_create(RPC_ANYSOCK); + if (transp == NULL) { + fprintf(stderr, "cannot create udp service.\n"); + exit(1); + } + if (!svc_register(transp, 
XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_UDP)) { + fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, udp).\n"); + exit(1); + } + + transp = svctcp_create(RPC_ANYSOCK, 0, 0); + if (transp == NULL) { + fprintf(stderr, "cannot create tcp service.\n"); + exit(1); + } + if (!svc_register(transp, XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_TCP)) { + fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, tcp).\n"); + exit(1); + } + + svc_run(); + fprintf(stderr, "svc_run returned\n"); + exit(1); + /* NOTREACHED */ +} + +static void +xact_prog_1(rqstp, transp) + struct svc_req *rqstp; + register SVCXPRT *transp; +{ + union { + char rpc_xact_1_arg; + } argument; + char *result; + bool_t (*xdr_argument)(), (*xdr_result)(); + char *(*local)(); + + switch (rqstp->rq_proc) { + case NULLPROC: + (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); + return; + + case RPC_XACT: + xdr_argument = xdr_char; + xdr_result = xdr_char; + local = (char *(*)()) rpc_xact_1; + break; + + case RPC_EXIT: + (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); + (void) pmap_unset(XACT_PROG, XACT_VERS); + exit(0); + + default: + svcerr_noproc(transp); + return; + } + bzero((char *)&argument, sizeof(argument)); + if (!svc_getargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { + svcerr_decode(transp); + return; + } + result = (*local)(&argument, rqstp); + if (result != NULL && !svc_sendreply(transp, (xdrproc_t)xdr_result, result)) { + svcerr_systemerr(transp); + } + if (!svc_freeargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { + fprintf(stderr, "unable to free arguments\n"); + exit(1); + } + return; +} diff --git a/performance/lmbench3/src/lat_select.c b/performance/lmbench3/src/lat_select.c new file mode 100644 index 0000000..583b505 --- /dev/null +++ b/performance/lmbench3/src/lat_select.c @@ -0,0 +1,223 @@ +/* + * lat_select.c - time select system call + * + * usage: lat_select [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [n] + * + * 
Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int w, int r); +void server(void* cookie); + +typedef int (*open_f)(void* cookie); +int open_file(void* cookie); +int open_socket(void* cookie); + +typedef struct _state { + char fname[L_tmpnam]; + open_f fid_f; + pid_t pid; + int sock; + int fid; + int num; + int max; + fd_set set; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-n <#descriptors>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] file|tcp\n"; + char buf[256]; + + morefds(); /* bump fd_cur to fd_max */ + state.num = 200; + while (( c = getopt(ac, av, "P:W:N:n:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'n': + state.num = bytes(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + if (streq("tcp", av[optind])) { + state.fid_f = open_socket; + server(&state); + benchmp(initialize, doit, cleanup, 0, parallel, + warmup, repetitions, &state); + sprintf(buf, "Select on %d tcp fd's", state.num); + kill(state.pid, SIGKILL); + waitpid(state.pid, NULL, 0); + micro(buf, get_n()); + } else if (streq("file", av[optind])) { + state.fid_f = open_file; + server(&state); + benchmp(initialize, doit, cleanup, 0, parallel, + warmup, repetitions, &state); + unlink(state.fname); + sprintf(buf, "Select 
on %d fd's", state.num); + micro(buf, get_n()); + } else { + lmbench_usage(ac, av, usage); + } + + exit(0); +} + +void +server(void* cookie) +{ + int pid; + state_t* state = (state_t*)cookie; + + pid = getpid(); + state->pid = 0; + + if (state->fid_f == open_file) { + /* Create a temporary file for clients to open */ + sprintf(state->fname, "lat_selectXXXXXX"); + state->fid = mkstemp(state->fname); + if (state->fid <= 0) { + char buf[L_tmpnam+128]; + sprintf(buf, "lat_select: Could not create temp file %s", state->fname); + perror(buf); + exit(1); + } + close(state->fid); + return; + } + + /* Create a socket for clients to connect to */ + state->sock = tcp_server(TCP_SELECT, SOCKOPT_REUSE); + if (state->sock <= 0) { + perror("lat_select: Could not open tcp server socket"); + exit(1); + } + + /* Start a server process to accept client connections */ + switch(state->pid = fork()) { + case 0: + /* child server process */ + while (pid == getppid()) { + int newsock = tcp_accept(state->sock, SOCKOPT_NONE); + read(newsock, &state->fid, 1); + close(newsock); + } + exit(0); + case -1: + /* error */ + perror("lat_select::server(): fork() failed"); + exit(1); + default: + break; + } +} + +int +open_socket(void* cookie) +{ + return tcp_connect("localhost", TCP_SELECT, SOCKOPT_NONE); +} + +int +open_file(void* cookie) +{ + state_t* state = (state_t*)cookie; + + return open(state->fname, O_RDONLY); +} + +void +doit(iter_t iterations, void * cookie) +{ + state_t * state = (state_t *)cookie; + fd_set nosave; + static struct timeval tv; + static count = 0; + + tv.tv_sec = 0; + tv.tv_usec = 0; + + while (iterations-- > 0) { + nosave = state->set; + select(state->num, 0, &nosave, 0, &tv); + } +} + +void +initialize(iter_t iterations, void *cookie) +{ + char c; + state_t * state = (state_t *)cookie; + int n, last = 0 /* lint */; + int N = state->num, fid, fd; + + if (iterations) return; + + fid = (*state->fid_f)(cookie); + if (fid <= 0) { + perror("Could not open device"); + exit(1); 
+ } + state->max = 0; + FD_ZERO(&(state->set)); + for (n = 0; n < N; n++) { + fd = dup(fid); + if (fd == -1) break; + if (fd > state->max) + state->max = fd; + FD_SET(fd, &(state->set)); + } + state->max++; + close(fid); + if (n != N) + exit(1); +} + +void +cleanup(iter_t iterations, void *cookie) +{ + int i; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + for (i = 0; i <= state->max; ++i) { + if (FD_ISSET(i, &(state->set))) + close(i); + } + FD_ZERO(&(state->set)); +} + + diff --git a/performance/lmbench3/src/lat_sem.c b/performance/lmbench3/src/lat_sem.c new file mode 100644 index 0000000..fac0d81 --- /dev/null +++ b/performance/lmbench3/src/lat_sem.c @@ -0,0 +1,162 @@ +/* + * lat_sem.c - semaphore test + * + * usage: lat_sem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <sys/sem.h> + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int sid); + +typedef struct _state { + int pid; + int semid; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Semaphore latency", get_n() * 2); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + state->semid = semget(IPC_PRIVATE, 2, IPC_CREAT | IPC_EXCL | 0600); + semctl(state->semid, 0, SETVAL, 0); + semctl(state->semid, 1, SETVAL, 0); + + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + signal(SIGTERM, exit); + handle_scheduler(benchmp_childid(), 1, 1); + writer(state->semid); + return; + + case -1: + perror("fork"); + return; + + default: + break; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } + /* free the semaphores */ + semctl(state->semid, 0, IPC_RMID); +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + struct 
sembuf sop[2]; + + sop[0].sem_num = 1; + sop[0].sem_op = -1; + sop[0].sem_flg = 0; + + sop[1].sem_num = 0; + sop[1].sem_op = 1; + sop[1].sem_flg = 0; + + while (iterations-- > 0) { + if (semop(state->semid, sop, 2) < 0) { + perror("(r) error on semaphore"); + exit(1); + } + } +} + +void +writer(register int sid) +{ + struct sembuf sop[2]; + + sop[0].sem_num = 1; + sop[0].sem_op = 1; + sop[0].sem_flg = 0; + + if (semop(sid, sop, 1) < 0) { + perror("(w) error on initial semaphore"); + exit(1); + } + + sop[0].sem_num = 0; + sop[0].sem_op = -1; + sop[0].sem_flg = 0; + + sop[1].sem_num = 1; + sop[1].sem_op = 1; + sop[1].sem_flg = 0; + + for ( ;; ) { + if (semop(sid, sop, 2) < 0) { + perror("(w) error on semaphore"); + exit(1); + } + } +} diff --git a/performance/lmbench3/src/lat_sig.c b/performance/lmbench3/src/lat_sig.c new file mode 100644 index 0000000..46aef0e --- /dev/null +++ b/performance/lmbench3/src/lat_sig.c @@ -0,0 +1,213 @@ +/* + * lat_sig.c - signal handler test + * + * XXX - this benchmark requires the POSIX sigaction interface. The reason + * for that is that the signal handler stays installed with that interface. + * The more portable signal() interface may or may not stay installed and + * reinstalling it each time is expensive. + * + * XXX - should really do a two process version. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <setjmp.h> + +uint64 caught, n; +double adj; +void handler() { } +jmp_buf prot_env; + +void +do_send(iter_t iterations, void* cookie) +{ + int me = getpid(); + + while (--iterations > 0) { + kill(me, 0); + } +} + +void +do_install(iter_t iterations, void* cookie) +{ + struct sigaction sa, old; + + while (iterations-- > 0) { + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGUSR1, &sa, &old); + } +} + +void +do_catch(iter_t iterations, void* cookie) +{ + int me = getpid(); + struct sigaction sa, old; + double u; + + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGUSR1, &sa, &old); + + while (--iterations > 0) { + kill(me, SIGUSR1); + } +} + +struct _state { + char* fname; + char* where; +}; + +void +prot() { + if (++caught == n) { + caught = 0; + n = benchmp_interval(benchmp_getstate()); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + int fd; + struct sigaction sa; + + if (iterations) return; + + fd = open(state->fname, 0); + state->where = mmap(0, 4096, PROT_READ, MAP_SHARED, fd, 0); + if ((long)state->where == -1) { + perror("mmap"); + exit(1); + } + + sa.sa_handler = prot; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGSEGV, &sa, 0); + sigaction(SIGBUS, &sa, 0); +} + +void +do_prot(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + n = iterations; + caught = 0; + + /* start the first timing interval */ + start(0); + + /* trigger the page fault, causing us to jump to prot() */ + *state->where = 1; +} + +/* + * Cost of catching the signal less the cost of sending it + */ +void +bench_catch(int parallel, int warmup, int repetitions) +{ + uint64 t, send_usecs, send_n; + + /* measure cost of sending signal */ + benchmp(NULL, do_send, NULL, 0, parallel, + warmup, repetitions, NULL); + send_usecs = gettime(); + send_n = 
get_n(); + + /* measure cost of sending & catching signal */ + benchmp(NULL, do_catch, NULL, 0, parallel, + warmup, repetitions, NULL); + + /* subtract cost of sending signal */ + if (gettime() > (send_usecs * get_n()) / send_n) { + settime(gettime() - (send_usecs * get_n()) / send_n); + } else { + settime(0); + } +} + +void +bench_prot(char* fname, int parallel, int warmup, int repetitions) +{ + uint64 catch_usecs, catch_n; + struct _state state; + + state.fname = fname; + + /* + * Catch protection faults. + * Assume that they will cost the same as a normal catch. + */ + bench_catch(parallel, warmup, repetitions); + catch_usecs = gettime(); + catch_n = get_n(); + + benchmp(initialize, do_prot, NULL, 0, parallel, + warmup, repetitions, &state); + if (gettime() > (catch_usecs * get_n()) / catch_n) { + settime(gettime() - (catch_usecs * get_n()) / catch_n); + } else { + settime(0); + } +} + + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] install|catch|prot [file]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 && optind != ac - 2) { + lmbench_usage(ac, av, usage); + } + + if (!strcmp("install", av[optind])) { + benchmp(NULL, do_install, NULL, 0, parallel, + warmup, repetitions, NULL); + micro("Signal handler installation", get_n()); + } else if (!strcmp("catch", av[optind])) { + bench_catch(parallel, warmup, repetitions); + micro("Signal handler overhead", get_n()); + } else if (!strcmp("prot", av[optind]) && optind == ac - 2) { + bench_prot(av[optind+1], parallel, warmup, repetitions); + micro("Protection fault", get_n()); + } else { + 
lmbench_usage(ac, av, usage); + } + return(0); +} diff --git a/performance/lmbench3/src/lat_syscall.c b/performance/lmbench3/src/lat_syscall.c new file mode 100644 index 0000000..9f30622 --- /dev/null +++ b/performance/lmbench3/src/lat_syscall.c @@ -0,0 +1,175 @@ +/* + * lat_syscall.c - time simple system calls + * + * Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + */ +char *id = "$Id: s.lat_syscall.c 1.11 97/06/15 22:38:58-07:00 lm $\n"; + +#include "bench.h" +#define FNAME "/usr/include/sys/types.h" + +struct _state { + int fd; + char* file; +}; + +void +do_getppid(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + getppid(); + } +} + +void +do_write(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + if (write(pState->fd, &c, 1) != 1) { + perror("/dev/null"); + return; + } + } +} + +void +do_read(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + if (read(pState->fd, &c, 1) != 1) { + perror("/dev/zero"); + return; + } + } +} + +void +do_stat(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + struct stat sbuf; + + while (iterations-- > 0) { + if (stat(pState->file, &sbuf) == -1) { + perror(pState->file); + return; + } + } +} + +void +do_fstat(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + struct stat sbuf; + + while (iterations-- > 0) { + if (fstat(pState->fd, &sbuf) == -1) { + perror("fstat"); + return; + } + } +} + +void +do_openclose(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + int fd; + + while (iterations-- > 0) { + fd = 
open(pState->file, 0); + if (fd == -1) { + perror(pState->file); + return; + } + close(fd); + } +} + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + struct _state state; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] null|read|write|stat|fstat|open [file]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 && optind != ac - 2 ) { + lmbench_usage(ac, av, usage); + } + + state.file = FNAME; + if (optind == ac - 2) + state.file = av[optind + 1]; + + if (!strcmp("null", av[optind])) { + benchmp(NULL, do_getppid, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple syscall", get_n()); + } else if (!strcmp("write", av[optind])) { + state.fd = open("/dev/null", 1); + benchmp(NULL, do_write, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple write", get_n()); + close(state.fd); + } else if (!strcmp("read", av[optind])) { + state.fd = open("/dev/zero", 0); + if (state.fd == -1) { + fprintf(stderr, "Simple read: -1\n"); + return(1); + } + benchmp(NULL, do_read, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple read", get_n()); + close(state.fd); + } else if (!strcmp("stat", av[optind])) { + benchmp(NULL, do_stat, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple stat", get_n()); + } else if (!strcmp("fstat", av[optind])) { + state.fd = open(state.file, 0); + benchmp(NULL, do_fstat, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple fstat", get_n()); + close(state.fd); + } else if (!strcmp("open", av[optind])) { + benchmp(NULL, do_openclose, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple open/close", 
get_n()); + } else { + lmbench_usage(ac, av, usage); + } + return(0); +} diff --git a/performance/lmbench3/src/lat_tcp.c b/performance/lmbench3/src/lat_tcp.c new file mode 100644 index 0000000..cf4d145 --- /dev/null +++ b/performance/lmbench3/src/lat_tcp.c @@ -0,0 +1,175 @@ +/* + * lat_tcp.c - simple TCP transaction latency test + * + * Three programs in one - + * server usage: tcp_xact -s + * client usage: tcp_xact [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname + * shutdown: tcp_xact -S hostname + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +typedef struct _state { + int msize; + int sock; + char *server; + char *buf; +} state_t; + +void init(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void doclient(iter_t iterations, void* cookie); +void server_main(); +void doserver(int sock); + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char buf[256]; + char *usage = "-s\n OR [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S server\n"; + + state.msize = 1; + + while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + state.sock = tcp_connect(optarg, + TCP_XACT, + SOCKOPT_NONE); + close(state.sock); + exit(0); + case 'm': + state.msize = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = 
atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + benchmp(init, doclient, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + sprintf(buf, "TCP latency using %s", state.server); + micro(buf, get_n()); + + exit(0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + int msize = htonl(state->msize); + + if (iterations) return; + + state->sock = tcp_connect(state->server, TCP_XACT, SOCKOPT_NONE); + state->buf = malloc(state->msize); + + write(state->sock, &msize, sizeof(int)); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +doclient(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + int sock = state->sock; + + while (iterations-- > 0) { + write(sock, state->buf, state->msize); + read(sock, state->buf, state->msize); + } +} + +void +server_main() +{ + int newsock, sock; + + GO_AWAY; + signal(SIGCHLD, sigchld_wait_handler); + sock = tcp_server(TCP_XACT, SOCKOPT_REUSE); + for (;;) { + newsock = tcp_accept(sock, SOCKOPT_NONE); + switch (fork()) { + case -1: + perror("fork"); + break; + case 0: + doserver(newsock); + exit(0); + default: + close(newsock); + break; + } + } + /* NOTREACHED */ +} + +void +doserver(int sock) +{ + int n; + + if (read(sock, &n, sizeof(int)) == sizeof(int)) { + int msize = ntohl(n); + char* buf = (char*)malloc(msize); + + for (n = 0; read(sock, buf, msize) > 0; n++) { + write(sock, buf, msize); + } + free(buf); + } else { + /* + * A connection with no data means shut down. 
+ */ + tcp_done(TCP_XACT); + kill(getppid(), SIGTERM); + exit(0); + } +} diff --git a/performance/lmbench3/src/lat_udp.c b/performance/lmbench3/src/lat_udp.c new file mode 100644 index 0000000..cd4be23 --- /dev/null +++ b/performance/lmbench3/src/lat_udp.c @@ -0,0 +1,207 @@ +/* + * udp_xact.c - simple UDP transaction latency test + * + * Three programs in one - + * server usage: lat_udp -s + * client usage: lat_udp [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname + * shutdown: lat_udp -S hostname + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +#define MAX_MSIZE (10 * 1024 * 1024) + +void client_main(int ac, char **av); +void server_main(); +void timeout(); +void init(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void doit(iter_t iterations, void* cookie); + +typedef struct _state { + int sock; + int seq; + int msize; + char *server; + char *buf; +} state_t; + + +int +main(int ac, char **av) +{ + state_t state; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + int shutdown = 0; + int msize = 4; + char buf[256]; + char *usage = "-s\n OR [-S] [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n NOTE: message size must be >= 4\n"; + + if (sizeof(int) != 4) { + fprintf(stderr, "lat_udp: Wrong sequence size\n"); + return(1); + } + + while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int seq, n; + int sock = udp_connect(optarg, + UDP_XACT, + SOCKOPT_NONE); + for (n = -1; n > -5; --n) { + seq = 
htonl(n); + (void) send(sock, &seq, sizeof(int), 0); + } + close(sock); + exit (0); + } + case 'm': + msize = atoi(optarg); + if (msize < sizeof(int)) { + lmbench_usage(ac, av, usage); + msize = 4; + } + if (msize > MAX_MSIZE) { + lmbench_usage(ac, av, usage); + msize = MAX_MSIZE; + } + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + state.msize = msize; + benchmp(init, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + sprintf(buf, "UDP latency using %s", state.server); + micro(buf, get_n()); + exit(0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); + state->seq = 0; + state->buf = (char*)malloc(state->msize); + + signal(SIGALRM, timeout); + alarm(15); +} + +void +doit(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + int seq = state->seq; + int net = htonl(seq); + int sock = state->sock; + int ret; + + alarm(15); + while (iterations-- > 0) { + *(int*)state->buf = htonl(seq++); + if (send(sock, state->buf, state->msize, 0) != state->msize) { + perror("lat_udp client: send failed"); + exit(5); + } + if (recv(sock, state->buf, state->msize, 0) != state->msize) { + perror("lat_udp client: recv failed"); + exit(5); + } + } + state->seq = seq; +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +timeout() +{ + fprintf(stderr, "Recv timed out\n"); + exit(1); +} + +void +server_main() +{ + char *buf = (char*)valloc(MAX_MSIZE); + int sock, sent, namelen, seq 
= 0; + struct sockaddr_in it; + + GO_AWAY; + + sock = udp_server(UDP_XACT, SOCKOPT_REUSE); + + while (1) { + int nbytes; + namelen = sizeof(it); + if ((nbytes = recvfrom(sock, (void*)buf, MAX_MSIZE, 0, + (struct sockaddr*)&it, &namelen)) < 0) { + fprintf(stderr, "lat_udp server: recvfrom: got wrong size\n"); + exit(9); + } + sent = ntohl(*(int*)buf); + if (sent < 0) { + udp_done(UDP_XACT); + exit(0); + } + if (sent != ++seq) { + seq = sent; + } + *(int*)buf = htonl(seq); + if (sendto(sock, (void*)buf, nbytes, 0, + (struct sockaddr*)&it, sizeof(it)) < 0) { + perror("lat_udp sendto"); + exit(9); + } + } +} diff --git a/performance/lmbench3/src/lat_unix.c b/performance/lmbench3/src/lat_unix.c new file mode 100644 index 0000000..1e321f8 --- /dev/null +++ b/performance/lmbench3/src/lat_unix.c @@ -0,0 +1,130 @@ +/* + * tcp_unix.c - simple UNIX socket transaction latency test + * + * lat_unix [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994-2000 Carl Staelin and Larry McVoy. + * Distributed under the FSF GPL with additional restriction that + * results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +struct _state { + int sv[2]; + int pid; + int msize; + char* buf; +}; +void initialize(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + int c; + char* usage = "[-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.msize = 1; + state.pid = 0; + + while (( c = getopt(ac, av, "m:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.msize = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + + micro("AF_UNIX sock stream latency", get_n()); + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + void exit(); + + if (iterations) return; + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pState->sv) == -1) { + perror("socketpair"); + } + + pState->buf = malloc(pState->msize); + if (pState->buf == NULL) { + fprintf(stderr, "buffer allocation\n"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + + if (pState->pid = fork()) + return; + + handle_scheduler(benchmp_childid(), 1, 1); + + /* Child sits and ping-pongs packets back to parent */ + signal(SIGTERM, exit); + while (read(pState->sv[0], pState->buf, pState->msize) == pState->msize) { + write(pState->sv[0], pState->buf, pState->msize); + } + exit(0); +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + + while (iterations-- > 0) { + if 
(write(pState->sv[1], pState->buf, pState->msize) != pState->msize + || read(pState->sv[1], pState->buf, pState->msize) != pState->msize) { + /* error handling: how do we signal failure? */ + cleanup(0, cookie); + exit(0); + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + if (pState->pid) { + kill(pState->pid, SIGKILL); + waitpid(pState->pid, NULL, 0); + pState->pid = 0; + } +} + diff --git a/performance/lmbench3/src/lat_unix_connect.c b/performance/lmbench3/src/lat_unix_connect.c new file mode 100644 index 0000000..46e1876 --- /dev/null +++ b/performance/lmbench3/src/lat_unix_connect.c @@ -0,0 +1,102 @@ +/* + * lat_unix_connect.c - simple UNIX connection latency test + * + * Three programs in one - + * server usage: lat_connect -s + * client usage: lat_connect [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * shutdown: lat_connect -q + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +#define CONNAME "/tmp/af_unix" + +void server_main(void); + +void benchmark(iter_t iterations, void* cookie) +{ + while (iterations-- > 0) { + int sock = unix_connect(CONNAME); + if (sock <= 0) + printf("error on iteration %lu\n",iterations); + close(sock); + } +} + +int main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char *usage = "-s\n OR [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n OR -S\n"; + int c; + + /* Start the server "-s" or Shut down the server "-S" */ + if (ac == 2) { + if (!strcmp(av[1], "-s")) { + if (fork() == 0) { + server_main(); + } + exit(0); + } + if (!strcmp(av[1], "-S")) { + int sock = unix_connect(CONNAME); + write(sock, "0", 1); + close(sock); + exit(0); + } + } + + /* + * Rest is client + */ + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac) { + lmbench_usage(ac, av, usage); + } + + benchmp(NULL, benchmark, NULL, 0, parallel, warmup, repetitions, NULL); + micro("UNIX connection cost", get_n()); +} + +void server_main(void) +{ + int newsock, sock; + char c; + + GO_AWAY; + sock = unix_server(CONNAME); + for (;;) { + newsock = unix_accept(sock); + c = 0; + read(newsock, &c, 1); + if (c && c == '0') { + unix_done(sock, CONNAME); + exit(0); + } + close(newsock); + } +} diff --git a/performance/lmbench3/src/lat_usleep.c b/performance/lmbench3/src/lat_usleep.c new file mode 100755 index 0000000..d18d0c8 --- /dev/null +++ b/performance/lmbench3/src/lat_usleep.c @@ -0,0 +1,259 @@ +/* + * lat_usleep.c - usleep duration/latency + * + * The APIs for usleep(3), nanosleep(2), select(2), pselect(2), + * getitimer(2) and setitimer(2) support resolutions down to + * a 
micro-second. However, many implementations do not support + * such resolution. Most current implementations (as of Fall + * 2002) simply put the process back on the run queue and the + * process may get run on the next scheduler time slice (10-20 + * milli-second resolution). + * + * This benchmark measures the true latency from the timer/sleep + * call to the resumption of program execution. If the timers + * actually worked properly, then the latency would be identical + * to the requested duration, or a little longer, so the input + * and output figures would be nearly identical. In most current + * implementations the value is rounded up to the next scheduler + * timeslice (e.g., a resolution of 20 milli-seconds, with all + * values rounded up). + * + * usage: lat_usleep [-u | -i] [-P <parallelism>] [-W <warmup>] \ + * [-N <repetitions>] usecs + * + * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <sched.h> + +typedef enum {USLEEP, NANOSLEEP, SELECT, PSELECT, ITIMER} timer_e; + +uint64 caught, + n; +struct itimerval value; + +typedef struct _state { + unsigned long usecs; +} state_t; + +void +bench_usleep(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + + while (iterations-- > 0) { + usleep(state->usecs); + } +} + +void +bench_nanosleep(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timespec req; + struct timespec rem; + + req.tv_sec = 0; + req.tv_nsec = state->usecs * 1000; + + while (iterations-- > 0) { + if (nanosleep(&req, &rem) < 0) { + while (nanosleep(&rem, &rem) < 0) + ; + } + } +} + +void +bench_select(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timeval tv; + + while (iterations-- > 0) { + tv.tv_sec = 0; + tv.tv_usec = state->usecs; + select(0, NULL, NULL, NULL, &tv); + } +} + +#ifdef _POSIX_SELECT +void +bench_pselect(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timespec ts; + + while (iterations-- > 0) { + ts.tv_sec = 0; + ts.tv_nsec = state->usecs * 1000; + pselect(0, NULL, NULL, NULL, &ts, NULL); + } +} +#endif /* _POSIX_SELECT */ + +void +interval() +{ + if (++caught == n) { + caught = 0; + n = benchmp_interval(benchmp_getstate()); + } + + setitimer(ITIMER_REAL, &value, NULL); +} + +void +initialize(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct sigaction sa; + + if (iterations) return; + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = state->usecs; + value.it_value.tv_sec = 0; + value.it_value.tv_usec = state->usecs; + + sa.sa_handler = interval; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); +} + +void +bench_itimer(iter_t iterations, void *cookie) +{ + n = iterations; + caught = 0; + + /* + * start the first timing interval + */ + start(0); + + /* + * create the first 
timer, causing us to jump to interval() + */ + setitimer(ITIMER_REAL, &value, NULL); + + while (1) { + sleep(100000); + } +} + +int +set_realtime() +{ + struct sched_param sp; + + sp.sched_priority = sched_get_priority_max(SCHED_RR); + if (sched_setscheduler(0, SCHED_RR, &sp) >= 0) return TRUE; + perror("sched_setscheduler"); + return FALSE; +} + +int +main(int ac, char **av) +{ + int realtime = 0; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char buf[512]; + timer_e what = USLEEP; + state_t state; + char *scheduler = ""; + char *mechanism = "usleep"; + char *usage = "[-r] [-u <method>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] usecs\nmethod=usleep|nanosleep|select|pselect|itimer\n"; + + realtime = 0; + + while ((c = getopt(ac, av, "ru:W:N:")) != EOF) { + switch (c) { + case 'r': + realtime = 1; + break; + case 'u': + if (strcmp(optarg, "usleep") == 0) { + what = USLEEP; + mechanism = "usleep"; + } else if (strcmp(optarg, "nanosleep") == 0) { + what = NANOSLEEP; + mechanism = "nanosleep"; + } else if (strcmp(optarg, "select") == 0) { + what = SELECT; + mechanism = "select"; +#ifdef _POSIX_SELECT + } else if (strcmp(optarg, "pselect") == 0) { + what = PSELECT; + mechanism = "pselect"; +#endif /* _POSIX_SELECT */ + } else if (strcmp(optarg, "itimer") == 0) { + what = ITIMER; + mechanism = "itimer"; + } else { + lmbench_usage(ac, av, usage); + } + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.usecs = bytes(av[optind]); + if (realtime && set_realtime()) scheduler = "realtime "; + + switch (what) { + case USLEEP: + benchmp(NULL, bench_usleep, NULL, + 0, parallel, warmup, repetitions, &state); + break; + case NANOSLEEP: + benchmp(NULL, 
bench_nanosleep, NULL, + 0, parallel, warmup, repetitions, &state); + break; + case SELECT: + benchmp(NULL, bench_select, NULL, + 0, parallel, warmup, repetitions, &state); + break; +#ifdef _POSIX_SELECT + case PSELECT: + benchmp(NULL, bench_pselect, NULL, + 0, parallel, warmup, repetitions, &state); + break; +#endif /* _POSIX_SELECT */ + case ITIMER: + benchmp(initialize, bench_itimer, NULL, + 0, parallel, warmup, repetitions, &state); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + sprintf(buf, "%s%s %lu microseconds", scheduler, mechanism, state.usecs); + micro(buf, get_n()); + return (0); +} diff --git a/performance/lmbench3/src/lib_debug.c b/performance/lmbench3/src/lib_debug.c new file mode 100644 index 0000000..2be1852 --- /dev/null +++ b/performance/lmbench3/src/lib_debug.c @@ -0,0 +1,131 @@ +#include <math.h> +#include "bench.h" +#include "lib_debug.h" + +/* + * return micro-seconds / iteration at the the fraction point. + * + * some examples: + * min = percent_point(values, size, 0.0) + * 1st quartile = percent_point(values, size, 0.25) + * median = percent_point(values, size, 0.5) + * 3rd quartile = percent_point(values, size, 0.75) + * max = percent_point(values, size, 1.0) + * + * the data points in the results structure are sorted from + * largest to smallest, so we adjust the fraction accordingly. 
+ */ +double +percent_point(double fraction) +{ + double t, r; + result_t* results = get_results(); + + t = (1.0 - fraction) * (results->N - 1); + if (t == floor(t)) { + /* no interpolation */ + r = results->v[(int)t].u / (double)results->v[(int)t].n; + } else { + /* percent point falls between two points, interpolate */ + r = results->v[(int)t].u / (double)results->v[(int)t].n; + r += results->v[(int)t+1].u / (double)results->v[(int)t+1].n; + r /= 2.0; + } + + return r; +} + +void +print_results(int details) +{ + int i; + result_t* results = get_results(); + + fprintf(stderr, "N=%d, t={", results->N); + for (i = 0; i < results->N; ++i) { + fprintf(stderr, "%.2f", (double)results->v[i].u/results->v[i].n); + if (i < results->N - 1) + fprintf(stderr, ", "); + } + fprintf(stderr, "}\n"); + if (details) { + fprintf(stderr, "\t/* {", results->N); + for (i = 0; i < results->N; ++i) { + fprintf(stderr, + "%llu/%llu", results->v[i].u, results->v[i].n); + if (i < results->N - 1) + fprintf(stderr, ", "); + } + fprintf(stderr, "} */\n"); + } + +} + +/* + * Prints bandwidth (MB/s) quartile information + * + * bytes - bytes per iteration + */ +void +bw_quartile(uint64 bytes) +{ + double b = (double)bytes; + + fprintf(stderr, "%d\t%e\t%e\t%e\t%e\t%e\n", get_n(), + (double)bytes / (1000000. * percent_point(0.00)), + (double)bytes / (1000000. * percent_point(0.25)), + (double)bytes / (1000000. * percent_point(0.50)), + (double)bytes / (1000000. * percent_point(0.75)), + (double)bytes / (1000000. * percent_point(1.00))); +} + +/* + * Prints latency (nano-seconds) quartile information + * + * n - number of operations per iteration + */ +void +nano_quartile(uint64 n) +{ + fprintf(stderr, "%d\t%e\t%e\t%e\t%e\t%e\n", get_n(), + percent_point(0.00) * 1000. / (double)n, + percent_point(0.25) * 1000. / (double)n, + percent_point(0.50) * 1000. / (double)n, + percent_point(0.75) * 1000. / (double)n, + percent_point(1.00) * 1000. 
/ (double)n); +} + +/* + * print the page|line|word offset for each link in the pointer chain. + */ +void +print_mem(char* addr, size_t size, size_t line) +{ + char* p; + uint64 base, off; + size_t pagesize = getpagesize(); + + base = (uint64)addr; + for (p = addr; *(char**)p != addr; p = *(char**)p) { + off = (uint64)p - base; + fprintf(stderr, "\t%lu\t%lu\t%lu\n", off / pagesize, + (off % pagesize) / line, (off % line) / sizeof(char*)); + } +} + +void +check_mem(char* addr, size_t size) +{ + char* p; + size_t i; + size_t max = size / sizeof(char*) + 1; + + for (p=addr, i=0; *(char**)p != addr && i < max; p = *(char**)p, i++) { + if (p < addr || addr + size <= p) { + fprintf(stderr, "check_mem: pointer out of range!\n"); + } + } + if (*(char**)p != addr) { + fprintf(stderr, "check_mem: pointer chain doesn't loop\n"); + } +} diff --git a/performance/lmbench3/src/lib_debug.h b/performance/lmbench3/src/lib_debug.h new file mode 100644 index 0000000..3e1b682 --- /dev/null +++ b/performance/lmbench3/src/lib_debug.h @@ -0,0 +1,10 @@ +#ifndef _LIB_DEBUG_H +#define _LIB_DEBUG_H + +void print_results(int details); +void bw_quartile(uint64 bytes); +void nano_quartile(uint64 n); +void print_mem(char* addr, size_t size, size_t line); +void check_mem(char* addr, size_t size); + +#endif /* _LIB_DEBUG_H */ diff --git a/performance/lmbench3/src/lib_mem.c b/performance/lmbench3/src/lib_mem.c new file mode 100644 index 0000000..3bdd4dc --- /dev/null +++ b/performance/lmbench3/src/lib_mem.c @@ -0,0 +1,699 @@ +/* + * lib_mem.c - library of routines used to analyze the memory hierarchy + * + * @(#)lib_mem.c 1.15 staelin@xxxxxxxxxxxxxxxxxxxxxxxx + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. + * Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ * Support for this development by Sun Microsystems is gratefully acknowledged.
+ */
+
+#include "bench.h"
+
+/* macro repeaters: expand their argument 5/10/50/100 times */
+#define	FIVE(m)		m m m m m
+#define	TEN(m)		FIVE(m) FIVE(m)
+#define	FIFTY(m)	TEN(m) TEN(m) TEN(m) TEN(m) TEN(m)
+#define	HUNDRED(m)	FIFTY(m) FIFTY(m)
+
+/* building blocks for the generated pointer-chasing benchmarks:
+ * DEREF follows one link of chain N; DECLARE/INIT/SAVE keep each
+ * chain's current position in a static so a rerun on the same
+ * buffer resumes where the last run stopped */
+#define DEREF(N)	p##N = (char**)*p##N;
+#define DECLARE(N)	static char **sp##N; register char **p##N;
+#define INIT(N)		p##N = (mem_benchmark_rerun && addr_save==state->addr) ? sp##N : (char**)state->p[N];
+#define SAVE(N)		sp##N = p##N;
+
+#define MEM_BENCHMARK_F(N) mem_benchmark_##N,
+/* table of the 16 generated benchmarks, indexed by parallelism-1 */
+benchmp_f mem_benchmarks[] = {REPEAT_15(MEM_BENCHMARK_F)};
+
+/* non-zero once any chain has been walked; cleared by mem_reset() */
+static int mem_benchmark_rerun = 0;
+
+/*
+ * Generator for mem_benchmark_<N>: walk N+1 independent pointer
+ * chains in parallel, one hundred links per chain per iteration.
+ */
+#define MEM_BENCHMARK_DEF(N,repeat,body)			\
+void								\
+mem_benchmark_##N(iter_t iterations, void *cookie)		\
+{								\
+	struct mem_state* state = (struct mem_state*)cookie;	\
+	static char *addr_save = NULL;				\
+	repeat(DECLARE);					\
+								\
+	repeat(INIT);						\
+	while (iterations-- > 0) {				\
+		HUNDRED(repeat(body));				\
+	}							\
+								\
+	repeat(SAVE);						\
+	addr_save = state->addr;				\
+	mem_benchmark_rerun = 1;				\
+}
+
+MEM_BENCHMARK_DEF(0, REPEAT_0, DEREF)
+MEM_BENCHMARK_DEF(1, REPEAT_1, DEREF)
+MEM_BENCHMARK_DEF(2, REPEAT_2, DEREF)
+MEM_BENCHMARK_DEF(3, REPEAT_3, DEREF)
+MEM_BENCHMARK_DEF(4, REPEAT_4, DEREF)
+MEM_BENCHMARK_DEF(5, REPEAT_5, DEREF)
+MEM_BENCHMARK_DEF(6, REPEAT_6, DEREF)
+MEM_BENCHMARK_DEF(7, REPEAT_7, DEREF)
+MEM_BENCHMARK_DEF(8, REPEAT_8, DEREF)
+MEM_BENCHMARK_DEF(9, REPEAT_9, DEREF)
+MEM_BENCHMARK_DEF(10, REPEAT_10, DEREF)
+MEM_BENCHMARK_DEF(11, REPEAT_11, DEREF)
+MEM_BENCHMARK_DEF(12, REPEAT_12, DEREF)
+MEM_BENCHMARK_DEF(13, REPEAT_13, DEREF)
+MEM_BENCHMARK_DEF(14, REPEAT_14, DEREF)
+MEM_BENCHMARK_DEF(15, REPEAT_15, DEREF)
+
+
+size_t*	words_initialize(size_t max, int scale);
+
+
+/* forget saved chain positions so the next run starts from state->p[] */
+void
+mem_reset()
+{
+	mem_benchmark_rerun = 0;
+}
+
+/*
+ * benchmp cleanup hook: free the buffer and the page/line/word
+ * permutation arrays.  No-op while iterations != 0 (benchmp calls
+ * cleanup hooks per-iteration as well as at the end).
+ */
+void
+mem_cleanup(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+
+	if (iterations) return;
+
+	if (state->addr) {
+		free(state->addr);
+		state->addr = NULL;
+	}
+	if (state->lines) {
+		free(state->lines);
+		state->lines = NULL;
+	}
+	if (state->pages) {
+		free(state->pages);
+		state->pages = NULL;
+	}
+	if (state->words) {
+		free(state->words);
+		state->words = NULL;
+	}
+}
+
+/*
+ * cleanup for tlb_initialize(): here state->addr is an array of
+ * per-page allocations (see tlb_initialize), so each page is freed
+ * individually before the array itself.
+ */
+void
+tlb_cleanup(iter_t iterations, void* cookie)
+{
+	size_t i;
+	struct mem_state* state = (struct mem_state*)cookie;
+	char **addr = (char**)state->addr;
+
+	if (iterations) return;
+
+	if (addr) {
+		for (i = 0; i < state->npages; ++i) {
+			if (addr[i]) free(addr[i]);
+		}
+		free(addr);
+		state->addr = NULL;
+	}
+	if (state->pages) {
+		free(state->pages);
+		state->pages = NULL;
+	}
+	if (state->lines) {
+		free(state->lines);
+		state->lines = NULL;
+	}
+}
+
+/*
+ * Common setup: allocate a buffer of maxlen (+2 pages for alignment),
+ * compute the words/lines/pages geometry from state->len, state->line
+ * and state->pagesize, and record a page-aligned base pointer.
+ * Leaves state->initialized = 0 on allocation failure.
+ *
+ * NOTE(review): permutation() is a project helper from bench.h —
+ * presumably returns a randomized array of page offsets; confirm.
+ */
+void
+base_initialize(iter_t iterations, void* cookie)
+{
+	int	nwords, nlines, nbytes, npages, nmpages;
+	size_t	*pages;
+	size_t	*lines;
+	size_t	*words;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	state->initialized = 0;
+
+	nbytes = state->len;
+	nwords = state->line / sizeof(char*);
+	nlines = state->pagesize / state->line;
+	npages = (nbytes + state->pagesize - 1) / state->pagesize;
+	nmpages= (state->maxlen + state->pagesize - 1) / state->pagesize;
+
+	srand(getpid());
+
+	words = NULL;
+	lines = NULL;
+	pages = permutation(nmpages, state->pagesize);
+	p = state->addr = (char*)malloc(state->maxlen + 2 * state->pagesize);
+
+	state->nwords = nwords;
+	state->nlines = nlines;
+	state->npages = npages;
+	state->lines = lines;
+	state->pages = pages;
+	state->words = words;
+
+	if (state->addr == NULL || pages == NULL)
+		return;
+
+	/* round the buffer start up to a page boundary */
+	if ((unsigned long)p % state->pagesize) {
+		p += state->pagesize - (unsigned long)p % state->pagesize;
+	}
+	state->base = p;
+	state->initialized = 1;
+	mem_reset();
+}
+
+/*
+ * Create a circular list of pointers using a simple striding
+ * algorithm.
+ *
+ * This access pattern corresponds to many array/matrix
+ * algorithms.  It should be easily and correctly predicted
+ * by any decent hardware prefetch algorithm.
+ */
+void
+stride_initialize(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+	size_t	i;
+	size_t	range = state->len;
+	size_t	stride = state->line;
+	char*	addr;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	addr = state->base;
+
+	/* each element points `stride' bytes ahead; last wraps to 0 */
+	for (i = stride; i < range; i += stride) {
+		*(char **)&addr[i - stride] = (char*)&addr[i];
+	}
+	*(char **)&addr[i - stride] = (char*)&addr[0];
+	state->p[0] = addr;
+	mem_reset();
+}
+
+/*
+ * Build a worst-case (cache- and TLB-thrashing) pointer chain;
+ * see the comment inside for the access pattern rationale.
+ */
+void
+thrash_initialize(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+	size_t	i;
+	size_t	j;
+	size_t	cur;
+	size_t	next;
+	size_t	cpage;
+	size_t	npage;
+	char*	addr;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	addr = state->base;
+
+	/*
+	 * Create a circular list of pointers with a random access
+	 * pattern.
+	 *
+	 * This stream corresponds more closely to linked list
+	 * memory access patterns.  For large data structures each
+	 * access will likely cause both a cache miss and a TLB miss.
+	 *
+	 * Access a different page each time.  This will eventually
+	 * cause a tlb miss each page.  It will also cause maximal
+	 * thrashing in the cache between the user data stream and
+	 * the page table entries.
+	 */
+	if (state->len % state->pagesize) {
+		/* buffer is not a whole number of pages: just chain all
+		 * line-sized words in bit-reversed order */
+		state->nwords = state->len / state->line;
+		state->words = words_initialize(state->nwords, state->line);
+		for (i = 0; i < state->nwords - 1; ++i) {
+			*(char **)&addr[state->words[i]] = (char*)&addr[state->words[i+1]];
+		}
+		*(char **)&addr[state->words[i]] = addr;
+		state->p[0] = addr;
+	} else {
+		/* whole pages: chain across pages so consecutive links
+		 * always land on different (randomly permuted) pages */
+		state->nwords = state->pagesize / state->line;
+		state->words = words_initialize(state->nwords, state->line);
+
+		for (i = 0; i < state->npages - 1; ++i) {
+			cpage = state->pages[i];
+			npage = state->pages[i + 1];
+			for (j = 0; j < state->nwords; ++j) {
+				cur = cpage + state->words[(i + j) % state->nwords];
+				next = npage + state->words[(i + j + 1) % state->nwords];
+				*(char **)&addr[cur] = (char*)&addr[next];
+			}
+		}
+		/* close the cycle: last page links back to page 0 */
+		cpage = state->pages[i];
+		npage = state->pages[0];
+		for (j = 0; j < state->nwords; ++j) {
+			cur = cpage + state->words[(i + j) % state->nwords];
+			next = npage + state->words[(j + 1) % state->nwords];
+			*(char **)&addr[cur] = (char*)&addr[next];
+		}
+		state->p[0] = (char*)&addr[state->pages[0]];
+	}
+	mem_reset();
+}
+
+/*
+ * mem_initialize
+ *
+ * Create a circular pointer chain that runs through memory.
+ *
+ * The chain threads through each cache line on a page before
+ * moving to the next page.  The order of cache line accesses
+ * is randomized to defeat cache prefetching algorithms.  In
+ * addition, the order of page accesses is randomized.  Finally,
+ * to reduce the impact of incorrect line-size estimates on
+ * machines with direct-mapped caches, we randomize which
+ * word in the cache line is used to hold the pointer.
+ *
+ * It initializes state->width pointers to elements evenly
+ * spaced through the chain.
+ */
+void
+mem_initialize(iter_t iterations, void* cookie)
+{
+	int i, j, k, l, np, nw, nwords, nlines, nbytes, npages, npointers;
+	unsigned int r;
+	size_t	*pages;
+	size_t	*lines;
+	size_t	*words;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	state->initialized = 0;
+
+	npointers = state->len / state->line;
+	nwords = state->nwords;
+	nlines = state->nlines;
+	npages = state->npages;
+	words = state->words = words_initialize(nwords, sizeof(char*));
+	lines = state->lines = words_initialize(nlines, state->line);
+	pages = state->pages;
+	p = state->base;
+
+	if (state->addr == NULL \
+	    || pages == NULL || lines == NULL || words == NULL) {
+		return;
+	}
+
+	/* setup the run through the pages */
+	l = 0;
+	for (i = 0; i < npages; ++i) {
+		/* link every word of line j to the same word of line j+1 */
+		for (j = 0; j < nlines - 1 && l < npointers - 1; ++j, ++l) {
+			for (k = 0; k < state->line; k += sizeof(char*)) {
+				*(char**)(p + pages[i] + lines[j] + k) =
+					p + pages[i] + lines[j+1] + k;
+			}
+			/* record up to `width' evenly spaced chain entry
+			 * points in state->p[] for parallel walkers */
+			if (l % (npointers/state->width) == 0
+			    && l / (npointers/state->width) < MAX_MEM_PARALLELISM) {
+				k = l / (npointers/state->width);
+				state->p[k] = p + pages[i] + lines[j] + words[k % nwords];
+			}
+		}
+
+		/* last line of page i links to first line of page i+1 */
+		if (i < npages - 1) {
+			for (k = 0; k < nwords; ++k)
+				*(char**)(p + pages[i] + lines[j] + words[k]) =
+					p + pages[i+1] + lines[0] + words[k];
+		}
+	}
+	/* close the cycle, rotating to the next word slot */
+	for (k = 0; k < nwords; ++k) {
+		nw = (k == nwords - 1) ? 0 : k + 1;
+		*(char**)(p + pages[npages-1] + lines[j] + words[k]) =
+			p + pages[0] + lines[0] + words[nw];
+	}
+
+	/* now, run through the chain once to clear the cache */
+	mem_reset();
+	(*mem_benchmarks[state->width-1])((nwords * npointers + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * line_initialize
+ *
+ * This is very similar to mem_initialize, except that we always use
+ * the first element of the cache line to hold the pointer.
+ *
+ */
+void
+line_initialize(iter_t iterations, void* cookie)
+{
+	int	i, j, k, line, nlines, npages;
+	unsigned int r;
+	size_t	*pages;
+	size_t	*lines;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	state->initialized = 0;
+
+	nlines = state->nlines;
+	npages = state->npages;
+	lines = state->lines = words_initialize(nlines, state->line);
+	pages = state->pages;
+	p = state->base;
+
+	state->width = 1;
+
+	if (state->addr == NULL || lines == NULL || pages == NULL)
+		return;
+
+	/* new setup runs through the lines */
+	for (i = 0; i < npages; ++i) {
+		/* sequence through the first word of each line */
+		for (j = 0; j < nlines - 1; ++j) {
+			*(char**)(p + pages[i] + lines[j]) =
+				p + pages[i] + lines[j+1];
+		}
+
+		/* jump to the first word of the first line on next page */
+		*(char**)(p + pages[i] + lines[j]) =
+			p + pages[(i < npages-1) ? i+1 : 0] + lines[0];
+	}
+	state->p[0] = p + pages[0] + lines[0];
+
+	/* now, run through the chain once to clear the cache */
+	mem_reset();
+	mem_benchmark_0((nlines * npages + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * tlb_initialize
+ *
+ * Build a pointer chain which accesses one word per page, for a total
+ * of (line * pages) bytes of data loaded into cache.
+ *
+ * If the number of elements in the chain (== #pages) is larger than the
+ * number of pages addressed by the TLB, then each access should cause
+ * a TLB miss (certainly as the number of pages becomes much larger than
+ * the TLB-addressed space).
+ *
+ * In addition, if we arrange the chain properly, each word we access
+ * will be in the cache.
+ *
+ * This means that the average access time for each pointer dereference
+ * should be a cache hit plus a TLB miss.
+ *
+ */
+void
+tlb_initialize(iter_t iterations, void* cookie)
+{
+	int	i, j, nwords, nlines, npages, pagesize;
+	unsigned int r;
+	char	**pages = NULL;
+	char	**addr = NULL;
+	size_t	*lines = NULL;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	state->initialized = 0;
+
+	pagesize = state->pagesize;
+	nwords = 0;
+	nlines = pagesize / sizeof(char*);
+	npages = state->len / pagesize;
+
+	srand(getpid() ^ (getppid()<<7));
+
+	lines = words_initialize(nlines, sizeof(char*));
+	pages = (char**)malloc(npages * sizeof(char**));
+	addr = (char**)malloc(npages * sizeof(char**));
+
+	/* note: unlike the other initializers, state->addr holds an
+	 * array of per-page pointers here; free via tlb_cleanup() */
+	state->nwords = 1;
+	state->nlines = nlines;
+	state->npages = npages;
+	state->words = NULL;
+	state->lines = lines;
+	state->pages = (size_t*)pages;
+	state->addr = (char*)addr;
+	if (addr) bzero(addr, npages * sizeof(char**));
+	if (pages) bzero(pages, npages * sizeof(char**));
+
+	if (addr == NULL || pages == NULL || lines == NULL) {
+		return;
+	}
+
+	/* first, layout the sequence of page accesses */
+	for (i = 0; i < npages; ++i) {
+		p = addr[i] = (char*)valloc(pagesize);
+		if (p == NULL) return;
+		/* fall back to over-allocation if valloc misaligned */
+		if ((unsigned long)p % pagesize) {
+			free(p);
+			p = addr[i] = (char*)valloc(2 * pagesize);
+			if (p == NULL) return;
+			p += pagesize - (unsigned long)p % pagesize;
+		}
+		pages[i] = (char*)p;
+	}
+
+	/* randomize the page sequences (except for zeroth page) */
+	r = (rand() << 15) ^ rand();
+	for (i = npages - 2; i > 0; --i) {
+		char* l;
+		r = (r << 1) ^ (rand() >> 4);
+		l = pages[(r % i) + 1];
+		pages[(r % i) + 1] = pages[i + 1];
+		pages[i + 1] = l;
+	}
+
+	/* now setup run through the pages */
+	for (i = 0; i < npages - 1; ++i) {
+		*(char**)(pages[i] + lines[i%nlines]) =
+			pages[i+1] + lines[(i+1)%nlines];
+	}
+	*(char**)(pages[i] + lines[i%nlines]) = pages[0] + lines[0];
+	state->p[0] = pages[0] + lines[0];
+
+	/* run through the chain once to clear the cache */
+	mem_reset();
+	mem_benchmark_0((npages + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * words_initialize
+ *
+ * This is supposed to create the order in which the words in a
+ * "cache line" are used.  Since we rarely know the cache line
+ * size with any real reliability, we need to jump around so
+ * as to maximize the number of potential cache misses, and to
+ * minimize the possibility of re-using a cache line.
+ */
+size_t*
+words_initialize(size_t max, int scale)
+{
+	size_t	i, j, nbits;
+	size_t*	words = (size_t*)malloc(max * sizeof(size_t));
+
+	if (!words) return NULL;
+
+	bzero(words, max * sizeof(size_t));
+	/* count the bits needed to index max entries */
+	for (i = max>>1, nbits = 0; i != 0; i >>= 1, nbits++)
+		;
+	for (i = 0; i < max; ++i) {
+		/* now reverse the bits */
+		for (j = 0; j < nbits; j++) {
+			if (i & (1<<j)) {
+				words[i] |= (1<<(nbits-j-1));
+			}
+		}
+		words[i] *= scale;
+	}
+	return words;
+}
+
+
+/*
+ * Estimate the cache line size: time a line-granular pointer chain
+ * at increasing strides and look for the latency jump, returning
+ * the stride just before timings level off again (0 if none found,
+ * (size_t)-1 if no buffer could be allocated).
+ */
+size_t
+line_find(size_t len, int warmup, int repetitions, struct mem_state* state)
+{
+	size_t	i, j, big_jump, line;
+	size_t	maxline = getpagesize() / 16;
+	double	baseline, t;
+
+	big_jump = 0;
+	line = 0;
+
+	/*
+	fprintf(stderr, "line_find(%d, ...): entering\n", len);
+	/**/
+
+	state->width = 1;
+	state->line = sizeof(char*);
+	/* halve len until the buffer allocation succeeds */
+	for (state->addr = NULL; !state->addr && len; ) {
+		state->len = state->maxlen = len;
+		line_initialize(0, state);
+		if (state->addr == NULL) len >>= 1;
+	}
+	if (state->addr == NULL) return -1;
+
+	for (i = sizeof(char*); i <= maxline; i<<=1) {
+		t = line_test(i, warmup, repetitions, state);
+
+		if (t == 0.) break;
+
+		/* a >30% jump followed by a return to <15% above the
+		 * previous timing brackets the line size */
+		if (i > sizeof(char*)) {
+			if (t > 1.3 * baseline) {
+				big_jump = 1;
+			} else if (big_jump && t < 1.15 * baseline) {
+				line = (i>>1);
+				break;
+			}
+		}
+		baseline = t;
+	}
+	mem_cleanup(0, state);
+	/*
+	fprintf(stderr, "line_find(%d, ...): returning %d\n", len, line);
+	/**/
+	return line;
+}
+
+/*
+ * Time the existing chain visiting only every `line' bytes
+ * (a subset of the configured lines per page); returns the
+ * median time (in lmbench time units) per ten dereferences.
+ * The full chain is restored before returning.
+ */
+double
+line_test(size_t line, int warmup, int repetitions, struct mem_state* state)
+{
+	size_t	i;
+	size_t	npages = state->npages;
+	size_t	nlines = state->pagesize / line;
+	double	t;
+	char*	p = state->base;
+	char*	first = p + state->pages[0] + state->lines[0];
+	char*	last = p + state->pages[npages-1] + state->lines[nlines-1];
+	result_t *r, *r_save;
+
+
+	/* only visit a subset of the lines in each page */
+	if (nlines < state->nlines) {
+		p = state->base;
+		for (i = 0; i < npages - 1; ++i) {
+			*(char**)(p + state->pages[i] + state->lines[nlines-1]) =
+				p + state->pages[i+1] + state->lines[0];
+		}
+		*(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) =
+			p + state->pages[0] + state->lines[0];
+	}
+
+	/* time the walk with a private result set, then restore the
+	 * caller's results */
+	r_save = get_results();
+	r = (result_t*)malloc(sizeof_result(repetitions));
+	insertinit(r);
+	p = first;
+	for (i = 0; i < repetitions; ++i) {
+		BENCH1(HUNDRED(p = *(char**)p;),0);
+		/*
+		fprintf(stderr, "%d\t%d\t%d\n", line, (int)gettime(), (int)get_n());
+		/**/
+		insertsort(gettime(), get_n(), r);
+	}
+	use_pointer(p);
+	set_results(r);
+	t = 10. * (double)gettime() / (double)get_n();
+	set_results(r_save);
+	free(r);
+
+	/*
+	fprintf(stderr, "%d\t%.5f\t%d\n", line, t, state->len);
+	/**/
+
+	/* fixup full path again */
+	if (nlines < state->nlines) {
+		p = state->base;
+		for (i = 0; i < npages - 1; ++i) {
+			*(char**)(p +
+				  state->pages[i] +
+				  state->lines[nlines-1]) =
+				p +
+				state->pages[i] +
+				state->lines[nlines];
+		}
+		*(char**)(p +
+			  state->pages[npages-1] +
+			  state->lines[nlines-1]) =
+			p +
+			state->pages[npages-1] +
+			state->lines[nlines];
+	}
+
+	return (t);
+}
+
+/*
+ * Measure available memory-access parallelism: walk 1..16 chains
+ * concurrently over the same buffer and report the best speedup
+ * relative to a single chain (>= 1.0; -1.0 on allocation failure).
+ */
+double
+par_mem(size_t len, int warmup, int repetitions, struct mem_state* state)
+{
+	int	i, j, k, n, __n;
+	double	baseline, max_par, par;
+
+	state->width = 1;
+	max_par = 1.;
+	__n = 1;
+
+	/* halve len until the buffer allocation succeeds */
+	for (state->addr = NULL; !state->addr && len; ) {
+		state->len = state->maxlen = len;
+		mem_initialize(0, state);
+		if (state->addr == NULL) len >>= 1;
+	}
+	if (state->addr == NULL) return -1.;
+
+	for (i = 0; i < MAX_MEM_PARALLELISM; ++i) {
+		n = len / state->line;
+		/* start the i+1 walkers at evenly spaced chain offsets */
+		for (j = 0; j <= i; j++) {
+			size_t nlines = len / state->line;
+			size_t lines_per_chunk = nlines / (i + 1);
+			size_t lines_per_page = state->pagesize / state->line;
+			size_t words_per_chunk = state->nwords / (i + 1);
+			size_t line = j * lines_per_chunk;
+			size_t word = (j * state->nwords) / (i + 1);
+
+			/*
+			if (state->len == 32768 && i == 7) {
+				fprintf(stderr, "\tj=%d, line=%d, word=%d, page=%d, _line=%d, _word=%d\n", j, line, word, line / lines_per_page, line % lines_per_page, word % state->nwords);
+			}
+			/**/
+			state->p[j] = state->base +
+				state->pages[line / lines_per_page] +
+				state->lines[line % lines_per_page] +
+				state->words[word % state->nwords];
+		}
+		mem_reset();
+		/* one warm-up pass, then the timed run */
+		(*mem_benchmarks[i])((len / sizeof(char*) + 100) / 100, state);
+		BENCH((*mem_benchmarks[i])(__n, state); __n = 1;, 0);
+		if (i == 0) {
+			baseline = (double)gettime() / (double)get_n();
+		} else if (gettime() > 0) {
+			par = baseline;
+			par /= (double)gettime() / (double)((i + 1) * get_n());
+			/*
+			fprintf(stderr, "par_mem(%d): i=%d, p=%5.2f, l=%d, lpp=%d, lpc=%d, nl=%d, wpc=%d\n", len, i, par, state->line, state->pagesize / state->line, (len / state->line) / (i + 1), len / state->line, state->nwords / (i + 1));
			/**/
+			if (par > max_par) {
+				max_par = par;
+			}
+		}
+	}
+	mem_cleanup(0, state);
+
+	return max_par;
+}
+
+
diff --git a/performance/lmbench3/src/lib_mem.h b/performance/lmbench3/src/lib_mem.h
new file mode 100644
index 0000000..5268515
--- /dev/null
+++ b/performance/lmbench3/src/lib_mem.h
@@ -0,0 +1,60 @@
+#ifndef LMBENCH_MEM_H
+#define LMBENCH_MEM_H
+
+
+#define MAX_MEM_PARALLELISM 16
+#define MEM_BENCHMARK_DECL(N) \
+	void mem_benchmark_##N(iter_t iterations, void* cookie);
+
+/* REPEAT_N(m) expands m(0) m(1) ... m(N) */
+#define REPEAT_0(m)	m(0)
+#define REPEAT_1(m)	REPEAT_0(m) m(1)
+#define REPEAT_2(m)	REPEAT_1(m) m(2)
+#define REPEAT_3(m)	REPEAT_2(m) m(3)
+#define REPEAT_4(m)	REPEAT_3(m) m(4)
+#define REPEAT_5(m)	REPEAT_4(m) m(5)
+#define REPEAT_6(m)	REPEAT_5(m) m(6)
+#define REPEAT_7(m)	REPEAT_6(m) m(7)
+#define REPEAT_8(m)	REPEAT_7(m) m(8)
+#define REPEAT_9(m)	REPEAT_8(m) m(9)
+#define REPEAT_10(m)	REPEAT_9(m) m(10)
+#define REPEAT_11(m)	REPEAT_10(m) m(11)
+#define REPEAT_12(m)	REPEAT_11(m) m(12)
+#define REPEAT_13(m)	REPEAT_12(m) m(13)
+#define REPEAT_14(m)	REPEAT_13(m) m(14)
+#define REPEAT_15(m)	REPEAT_14(m) m(15)
+
+struct mem_state {
+	char*	addr;	/* raw pointer returned by malloc */
+	char*	base;	/* page-aligned pointer */
+	char*	p[MAX_MEM_PARALLELISM];	/* chain entry points */
+	int	initialized;	/* set by *_initialize on success */
+	int	width;		/* number of parallel chains */
+	size_t	len;		/* working-set size in bytes */
+	size_t	maxlen;		/* allocation size in bytes */
+	size_t	line;		/* assumed cache-line size */
+	size_t	pagesize;
+	size_t	nlines;
+	size_t	npages;
+	size_t	nwords;
+	size_t*	pages;	/* permuted page offsets */
+	size_t*	lines;	/* permuted line offsets within a page */
+	size_t*	words;	/* permuted word offsets within a line */
+};
+
+void	stride_initialize(iter_t iterations, void* cookie);
+void	thrash_initialize(iter_t iterations, void* cookie);
+void	mem_initialize(iter_t iterations, void* cookie);
+void	line_initialize(iter_t iterations, void* cookie);
+void	tlb_initialize(iter_t iterations, void* cookie);
+void	mem_cleanup(iter_t iterations, void*
cookie); +void tlb_cleanup(iter_t iterations, void* cookie); + +REPEAT_15(MEM_BENCHMARK_DECL) +extern benchmp_f mem_benchmarks[]; + +size_t line_find(size_t l, int warmup, int repetitions, struct mem_state* state); +double line_test(size_t l, int warmup, int repetitions, struct mem_state* state); +double par_mem(size_t l, int warmup, int repetitions, struct mem_state* state); + +#endif /* LMBENCH_MEM_H */ + diff --git a/performance/lmbench3/src/lib_sched.c b/performance/lmbench3/src/lib_sched.c new file mode 100644 index 0000000..035925b --- /dev/null +++ b/performance/lmbench3/src/lib_sched.c @@ -0,0 +1,239 @@ +#include "bench.h" + +/* #define _DEBUG */ + +#if defined(HAVE_SYSMP) +#include <sys/sysmp.h> +#include <sys/sysinfo.h> +#endif + +#if defined(HAVE_MPCTL) +#include <sys/mpctl.h> +#endif + +#if defined(HAVE_BINDPROCESSOR) +#include <sys/processor.h> +#endif + +#if defined(HAVE_PROCESSOR_BIND) +#include <sys/types.h> +#include <sys/processor.h> +#include <sys/procset.h> +#endif + +#if defined(HAVE_SCHED_SETAFFINITY) +#include <sched.h> +#endif + +extern int custom(char* str, int cpu); +extern int reverse_bits(int cpu); +extern int sched_ncpus(); +extern int sched_pin(int cpu); + +/* + * The interface used by benchmp. + * + * childno is the "logical" child id number. + * In range [0, ..., parallel-1]. + * benchproc is the "logical" id within the benchmark process. The + * benchmp-created process is logical ID zero, child processes + * created by the benchmark range from [1, ..., nbenchprocs]. + * nbenchprocs is the number of child processes that each benchmark + * process will create. Most benchmarks will leave this zero, + * but some such as the pipe() benchmarks will not. + */ +int +handle_scheduler(int childno, int benchproc, int nbenchprocs) +{ + int cpu = 0; + char* sched = getenv("LMBENCH_SCHED"); + + if (!sched || strcasecmp(sched, "DEFAULT") == 0) { + /* do nothing. 
Allow scheduler to control placement */ + return 0; + } else if (strcasecmp(sched, "SINGLE") == 0) { + /* assign all processes to CPU 0 */ + cpu = 0; + } else if (strcasecmp(sched, "BALANCED") == 0) { + /* assign each benchmark process to its own processor, + * but child processes will share the CPU with the + * parent. + */ + cpu = childno; + } else if (strcasecmp(sched, "BALANCED_SPREAD") == 0) { + /* + * assign each benchmark process to its own processor, + * logically as far away from neighboring IDs as + * possible. This can help identify bus contention + * issues in SMPs with hierarchical busses or NUMA + * memory. + */ + cpu = reverse_bits(childno); + } else if (strcasecmp(sched, "UNIQUE") == 0) { + /* + * assign each benchmark process and each child process + * to its own processor. + */ + cpu = childno * (nbenchprocs + 1) + benchproc; + } else if (strcasecmp(sched, "UNIQUE_SPREAD") == 0) { + /* + * assign each benchmark process and each child process + * to its own processor, logically as far away from + * neighboring IDs as possible. This can help identify + * bus contention issues in SMPs with hierarchical busses + * or NUMA memory. + */ + cpu = reverse_bits(childno * (nbenchprocs + 1) + benchproc); + } else if (strncasecmp(sched, "CUSTOM ", strlen("CUSTOM ")) == 0) { + cpu = custom(sched + strlen("CUSTOM"), childno); + } else if (strncasecmp(sched, "CUSTOM_UNIQUE ", strlen("CUSTOM_UNIQUE ")) == 0) { + cpu = custom(sched + strlen("CUSTOM_UNIQUE"), + childno * (nbenchprocs + 1) + benchproc); + } else { + /* default action: do nothing */ + return; + } + + return sched_pin(cpu % sched_ncpus()); +} + +/* + * Use to get sequentially created processes "far" away from + * each other in an SMP. + * + * XXX: probably doesn't work for NCPUS not a power of two. 
+ */ +int +reverse_bits(int cpu) +{ + int i; + int nbits; + int max = sched_ncpus() - 1; + int cpu_reverse = 0; + + for (i = max>>1, nbits = 1; i > 0; i >>= 1, nbits++) + ; + /* now reverse the bits */ + for (i = 0; i < nbits; i++) { + if (cpu & (1<<i)) + cpu_reverse |= (1<<(nbits-i-1)); + } + return cpu_reverse; +} + +/* + * Custom is a user-defined sequence of CPU ids + */ +int +custom(char* str, int cpu) +{ + static int nvalues = -1; + static int* values = NULL; + + if (values == NULL) { + nvalues = 0; + values = (int*)malloc(sizeof(int)); + + while (*str) { + char* q; + while (*str && !isdigit(*str)) str++; + q = str; + while (*str && isdigit(*str)) str++; + if (str == q) break; + *str++ = 0; + sscanf(q, "%d", &values[nvalues++]); + values = (int*)realloc((void*)values, (nvalues + 1) * sizeof(int)); + } + } + if (nvalues == 0) return 0; + return values[cpu % nvalues]; +} + +/* + * Return the number of processors in this host + */ +int +sched_ncpus() +{ +#ifdef MP_NPROCS + /* SGI IRIX interface */ + return sysmp(MP_NPROCS); +#elif defined(HAVE_MPCTL) + /* HP-UX interface */ + return mpctl(MPC_GETNUMSPUS_SYS, 0, 0); +#elif defined(_SC_NPROCESSORS_ONLN) + /* AIX, Solaris, and Linux interface */ + return sysconf(_SC_NPROCESSORS_ONLN); +#endif + return 1; +} + +/* + * Pin the current process to the given CPU + * + * return 0 when successful + * returns -1 on error + */ +int +sched_pin(int cpu) +{ + int retval = -1; + +#ifdef HAVE_SYSMP + /* SGI IRIX interface */ + retval = sysmp(MP_MUSTRUN, cpu); +#elif defined(HAVE_MPCTL) + /* HP-UX interface */ + retval = mpctl(MPC_SET_PROCESS, cpu, MPC_SELFPID); +#elif defined(HAVE_BINDPROCESSOR) + /* AIX interface */ + retval = bindprocessor(BINDPROCESS, getpid(), cpu); +#elif defined(HAVE_PROCESSOR_BIND) + /* Solaris interface */ + retval = processor_bind(P_PID, P_MYPID, cpu, NULL); +#elif defined(HAVE_SCHED_SETAFFINITY) + /* Linux interface */ + static unsigned long* mask = NULL; + static unsigned long* cpumask = NULL; + 
static int sz = 0; + static int ncpus = 0; + int i; + int j; + + if (cpumask == NULL) { + sz = 1 + (2 * sched_ncpus()) / (8 * sizeof(unsigned long)); + mask = (unsigned long*)malloc(sz * sizeof(unsigned long)); + cpumask = (unsigned long*)malloc(sz * sizeof(unsigned long)); + retval = sched_getaffinity(0, sz * sizeof(unsigned long), cpumask); + if (retval < 0) perror("sched_getaffinity:"); + if (retval < 0) return retval; + + for (i = 0; i < sz * 8 * sizeof(unsigned long); ++i) { + int word = i / (8 * sizeof(unsigned long)); + int bit = i % (8 * sizeof(unsigned long)); + if (cpumask[word] & (1 << bit)) ncpus++; + } + } + cpu %= ncpus; + + bzero(mask, sz * sizeof(unsigned long)); + for (i = 0, j = 0; i < sz * 8 * sizeof(unsigned long); ++i) { + int word = i / (8 * sizeof(unsigned long)); + int bit = i % (8 * sizeof(unsigned long)); + if (cpumask[word] & (1 << bit)) { + if (j >= cpu) { + mask[word] |= (1 << bit); + break; + } + j++; + } + } + retval = sched_setaffinity(0, sz * sizeof(unsigned long), mask); + if (retval < 0) perror("sched_setaffinity:"); +#ifdef _DEBUG + fprintf(stderr, "sched_pin(%d): pid=%d, returning %d\n", cpu, (int)getpid(), retval); +#endif /* _DEBUG */ + +#endif + return retval; +} diff --git a/performance/lmbench3/src/lib_stats.c b/performance/lmbench3/src/lib_stats.c new file mode 100644 index 0000000..cc8b5a6 --- /dev/null +++ b/performance/lmbench3/src/lib_stats.c @@ -0,0 +1,603 @@ +#include <math.h> +#include "bench.h" + +#define BOOTSTRAP_COUNT 200 + +/* + * a comparison function used by qsort + */ +int +int_compare(const void *a, const void *b) +{ + if (*(int*)a < *(int*)b) return -1; + if (*(int*)a > *(int*)b) return 1; + return 0; +} + +/* + * a comparison function used by qsort + */ +int +uint64_compare(const void *a, const void *b) +{ + if (*(uint64*)a < *(uint64*)b) return -1; + if (*(uint64*)a > *(uint64*)b) return 1; + return 0; +} + +/* + * a comparison function used by qsort + */ +int +double_compare(const void *a, const void 
*b) +{ + if (*(double*)a < *(double*)b) return -1; + if (*(double*)a > *(double*)b) return 1; + return 0; +} + +/* + * return the median value of an array of int + */ +int +int_median(int *values, int size) +{ + qsort(values, size, sizeof(int), int_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2; +} + +/* + * return the median value of an array of int + */ +uint64 +uint64_median(uint64 *values, int size) +{ + qsort(values, size, sizeof(uint64), uint64_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2; +} + +/* + * return the median value of an array of doubles + */ +double +double_median(double *values, int size) +{ + qsort(values, size, sizeof(double), double_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2.0; +} + +/* + * return the mean value of an array of int + */ +int +int_mean(int *values, int size) +{ + int i; + int sum = 0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / size; +} + +/* + * return the mean value of an array of int + */ +uint64 +uint64_mean(uint64 *values, int size) +{ + int i; + uint64 sum = 0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / size; +} + +/* + * return the mean value of an array of doubles + */ +double +double_mean(double *values, int size) +{ + int i; + double sum = 0.0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / (double)size; +} + +/* + * return the min value of an array of int + */ +int +int_min(int *values, int size) +{ + int i; + int min = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the min value of an array of int + */ +uint64 +uint64_min(uint64 *values, int size) +{ + int i; + uint64 min = values[0]; + + for 
(i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the min value of an array of doubles + */ +double +double_min(double *values, int size) +{ + int i; + double min = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the max value of an array of int + */ +int +int_max(int *values, int size) +{ + int i; + int max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the max value of an array of int + */ +uint64 +uint64_max(uint64 *values, int size) +{ + int i; + uint64 max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the max value of an array of doubles + */ +double +double_max(double *values, int size) +{ + int i; + double max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the variance of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41 + */ +double int_variance(int *values, int size) +{ + int i; + double sum = 0.0; + int mean = int_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + + return sum / (double)(size - 1); +} + +/* + * return the variance of an array of uint64s + */ +double uint64_variance(uint64 *values, int size) +{ + int i; + double sum = 0.0; + uint64 mean = uint64_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + return sum / (double)(size - 1); +} + +/* + * return the variance of an array of doubles + */ +double double_variance(double *values, int size) +{ + int i; + double sum = 0.0; + double mean = double_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + + return sum / (double)(size - 1); +} + +/* + * return the moment of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41, 90 + */ +double int_moment(int moment, int *values, int size) +{ + int i, j; + double sum = 0.0; + int mean = int_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the moment of an array of uint64s + */ +double uint64_moment(int moment, uint64 *values, int size) +{ + int i, j; + double sum = 0.0; + uint64 mean = uint64_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the moment of an array of doubles + */ +double double_moment(int moment, double *values, int size) +{ + int i, j; + double sum = 0.0; + double mean = double_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the standard error of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41 + */ +double int_stderr(int *values, int size) +{ + return sqrt(int_variance(values, size)); +} + +/* + * return the standard error of an array of uint64s + */ +double uint64_stderr(uint64 *values, int size) +{ + return sqrt(uint64_variance(values, size)); +} + +/* + * return the standard error of an array of doubles + */ +double double_stderr(double *values, int size) +{ + return sqrt(double_variance(values, size)); +} + +/* + * return the skew of an array of ints + * + */ +double int_skew(int *values, int size) +{ + double sigma = int_stderr(values, size); + double moment3 = int_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the skew of an array of uint64s + */ +double uint64_skew(uint64 *values, int size) +{ + double sigma = uint64_stderr(values, size); + double moment3 = uint64_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the skew of an array of doubles + */ +double double_skew(double *values, int size) +{ + double sigma = double_stderr(values, size); + double moment3 = double_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the kurtosis of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 90; + */ +double int_kurtosis(int *values, int size) +{ + double variance = int_variance(values, size); + double moment4 = int_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * return the kurtosis of an array of uint64s + */ +double uint64_kurtosis(uint64 *values, int size) +{ + double variance = uint64_variance(values, size); + double moment4 = uint64_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * return the kurtosis of an array of doubles + */ +double double_kurtosis(double *values, int size) +{ + double variance = double_variance(values, size); + double moment4 = double_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * BOOTSTRAP: + * + * stderr = sqrt(sum_i(s[i] - sum_j(s[j])/B)**2 / (B - 1)) + * + * Reference: "An Introduction to the Bootstrap" by Bradley + * Efron and Robert J. Tibshirani, page 12. + */ + +/* + * return the bootstrap estimation of the standard error + * of an array of ints + */ +double int_bootstrap_stderr(int *values, int size, int_stat f) +{ + int i, j; + int *samples = (int*)malloc(size * sizeof(int)); + double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum = 0; + double sum = 0; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (double)(*f)(samples, size); + s_sum += s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + sum /= (double)(BOOTSTRAP_COUNT - 1); + + free(samples); + free(s); + + return sqrt(sum); +} + +/* + * return the bootstrap estimation of the standard error + * of an array of uint64s + */ +double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f) +{ + int i, j; + uint64 *samples = (uint64*)malloc(size * sizeof(uint64)); + 
double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum; + double sum; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0, s_sum = 0.0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (double)(*f)(samples, size); + s_sum += s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0, sum = 0.0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + free(samples); + free(s); + + return sqrt(sum/(double)(BOOTSTRAP_COUNT - 1)); +} + +/* + * return the bootstrap estimation of the standard error + * of an array of doubles + */ +double double_bootstrap_stderr(double *values, int size, double_stat f) +{ + int i, j; + double *samples = (double*)malloc(size * sizeof(double)); + double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum = 0; + double sum = 0; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (*f)(samples, size); + s_sum += (double)s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + sum /= (double)(BOOTSTRAP_COUNT - 1); + + free(samples); + free(s); + + return sqrt(sum); +} + +/* + * regression(x, y, sig, n, a, b, sig_a, sig_b, chi2) + * + * This routine is derived from equations in "Numerical Recipes in C" + * (second edition) by Press, et. al., pages 661-665. + * + * compute the linear regression y = a + bx for (x,y), where y[i] has + * standard deviation sig[i]. + * + * returns the coefficients a and b, along with an estimation of their + * error (standard deviation) in sig_a and sig_b. + * + * returns chi2 for "goodness of fit" information. 
+ */ + +void +regression(double *x, double *y, double *sig, int n, + double *a, double *b, double *sig_a, double *sig_b, + double *chi2) +{ + int i; + double S = 0.0, Sx = 0.0, Sy = 0.0, Stt = 0.0, Sx_S; + + /* compute some basic statistics */ + for (i = 0; i < n; ++i) { + /* Equations 15.2.4: for S, Sx, Sy */ + double weight = 1.0 / (sig ? sig[i] * sig[i] : 1.0); + S += weight; + Sx += weight * x[i]; + Sy += weight * y[i]; + } + + *b = 0.0; + Sx_S = Sx / S; + for (i = 0; i < n; ++i) { + /* + * Equation 15.2.15 for t + * Equation 15.2.16 for Stt + * Equation 15.2.17 for b, do summation portion of equation + * compute Sum i=0,n-1 (t_i * y[i] / sig[i])) + */ + double t_i = (x[i] - Sx_S) / (sig ? sig[i] : 1.0); + Stt += t_i * t_i; + *b += t_i * y[i] / (sig ? sig[i] : 1.0); + } + + /* + * Equation 15.2.17 for b, do 1/Stt * summation + * Equation 15.2.18 for a + * Equation 15.2.19 for sig_a + * Equation 15.2.20 for sig_b + */ + *b /= Stt; + *a = (Sy - *b * Sx) / S; + *sig_a = sqrt((1.0 + (Sx * Sx) / (S * Stt)) / S); + *sig_b = sqrt(1.0 / Stt); + + /* Equation 15.2.2 for chi2, the merit function */ + *chi2 = 0.0; + for (i = 0; i < n; ++i) { + double merit = (y[i] - ((*a) + (*b) * x[i])) / (sig ? sig[i] : 1.0); + *chi2 += merit * merit; + } + if (sig == NULL) { + *sig_a *= sqrt((*chi2) / (n - 2)); + *sig_b *= sqrt((*chi2) / (n - 2)); + } +} + diff --git a/performance/lmbench3/src/lib_tcp.c b/performance/lmbench3/src/lib_tcp.c new file mode 100644 index 0000000..d84a63e --- /dev/null +++ b/performance/lmbench3/src/lib_tcp.c @@ -0,0 +1,238 @@ +/* + * tcp_lib.c - routines for managing TCP connections. + * + * Positive port/program numbers are RPC ports, negative ones are TCP ports. + * + * Copyright (c) 1994-1996 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a TCP socket, bind it, figure out the port, + * and advertise the port as program "prog". + * + * XXX - it would be nice if you could advertise ascii strings. 
+ */ +int +tcp_server(int prog, int rdwr) +{ + int sock; + struct sockaddr_in s; + +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "tcp_server(%u, %u)\n", prog, rdwr); +#endif + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + bzero((void*)&s, sizeof(s)); + s.sin_family = AF_INET; + if (prog < 0) { + s.sin_port = htons(-prog); + } + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } + if (listen(sock, 100) < 0) { + perror("listen"); + exit(4); + } + if (prog > 0) { +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server port %d\n", sockport(sock)); +#endif + (void)pmap_unset((u_long)prog, (u_long)1); + if (!pmap_set((u_long)prog, (u_long)1, (u_long)IPPROTO_TCP, + (unsigned short)sockport(sock))) { + perror("pmap_set"); + exit(5); + } + } + return (sock); +} + +/* + * Unadvertise the socket + */ +int +tcp_done(int prog) +{ + if (prog > 0) { + pmap_unset((u_long)prog, (u_long)1); + } + return (0); +} + +/* + * Accept a connection and return it + */ +int +tcp_accept(int sock, int rdwr) +{ + struct sockaddr_in s; + int newsock, namelen; + + namelen = sizeof(s); + bzero((void*)&s, namelen); + +retry: + if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { + if (errno == EINTR) + goto retry; + perror("accept"); + exit(6); + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server newsock port %d\n", sockport(newsock)); +#endif + sock_optimize(newsock, rdwr); + return (newsock); +} + +/* + * Connect to the TCP socket advertised as "prog" on "host" and + * return the connected socket. + * + * Hacked Thu Oct 27 1994 to cache pmap_getport calls. This saves + * about 4000 usecs in loopback lat_connect calls. I suppose we + * should time gethostbyname() & pmap_getprot(), huh? 
+ */ +int +tcp_connect(char *host, int prog, int rdwr) +{ + static struct hostent *h; + static struct sockaddr_in s; + static u_short save_port; + static u_long save_prog; + static char *save_host; + int sock; + static int tries = 0; + + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + exit(1); + } + if (rdwr & SOCKOPT_PID) { + static unsigned short port; + struct sockaddr_in sin; + + if (!port) { + port = (unsigned short)(getpid() << 4); + if (port < 1024) { + port += 1024; + } + } + do { + port++; + bzero((void*)&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(port); + } while (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) == -1); + } +#ifdef LIBTCP_VERBOSE + else { + struct sockaddr_in sin; + + bzero((void*)&sin, sizeof(sin)); + sin.sin_family = AF_INET; + if (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(2); + } + } + fprintf(stderr, "Client port %d\n", sockport(sock)); +#endif + sock_optimize(sock, rdwr); + if (!h || host != save_host || prog != save_prog) { + save_host = host; /* XXX - counting on them not + * changing it - benchmark only. 
+ */ + save_prog = prog; + if (!(h = gethostbyname(host))) { + perror(host); + exit(2); + } + bzero((void *) &s, sizeof(s)); + s.sin_family = AF_INET; + bcopy((void*)h->h_addr, (void *)&s.sin_addr, h->h_length); + if (prog > 0) { + save_port = pmap_getport(&s, prog, + (u_long)1, IPPROTO_TCP); + if (!save_port) { + perror("lib TCP: No port found"); + exit(3); + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server port %d\n", save_port); +#endif + s.sin_port = htons(save_port); + } else { + s.sin_port = htons(-prog); + } + } + if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + if (errno == ECONNRESET + || errno == ECONNREFUSED + || errno == EAGAIN) { + close(sock); + if (++tries > 10) return(-1); + return (tcp_connect(host, prog, rdwr)); + } + perror("connect"); + exit(4); + } + tries = 0; + return (sock); +} + +void +sock_optimize(int sock, int flags) +{ + if (flags & SOCKOPT_READ) { + int sockbuf = SOCKBUF; + + while (setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &sockbuf, + sizeof(int))) { + sockbuf >>= 1; + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "sockopt %d: RCV: %dK\n", sock, sockbuf>>10); +#endif + } + if (flags & SOCKOPT_WRITE) { + int sockbuf = SOCKBUF; + + while (setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &sockbuf, + sizeof(int))) { + sockbuf >>= 1; + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "sockopt %d: SND: %dK\n", sock, sockbuf>>10); +#endif + } + if (flags & SOCKOPT_REUSE) { + int val = 1; + if (setsockopt(sock, SOL_SOCKET, + SO_REUSEADDR, &val, sizeof(val)) == -1) { + perror("SO_REUSEADDR"); + } + } +} + +int +sockport(int s) +{ + int namelen; + struct sockaddr_in sin; + + namelen = sizeof(sin); + if (getsockname(s, (struct sockaddr *)&sin, &namelen) < 0) { + perror("getsockname"); + return(-1); + } + return ((int)ntohs(sin.sin_port)); +} diff --git a/performance/lmbench3/src/lib_tcp.h b/performance/lmbench3/src/lib_tcp.h new file mode 100644 index 0000000..bc820b2 --- /dev/null +++ b/performance/lmbench3/src/lib_tcp.h @@ -0,0 +1,12 @@ +#include 
<sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> + +int tcp_server(int prog, int rdwr); +int tcp_done(int prog); +int tcp_accept(int sock, int rdwr); +int tcp_connect(char *host, int prog, int rdwr); +void sock_optimize(int sock, int rdwr); +int sockport(int s); diff --git a/performance/lmbench3/src/lib_timing.c b/performance/lmbench3/src/lib_timing.c new file mode 100644 index 0000000..af8cf68 --- /dev/null +++ b/performance/lmbench3/src/lib_timing.c @@ -0,0 +1,1774 @@ +/* + * a timing utilities library + * + * Requires 64bit integers to work. + * + * %W% %@% + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994-1998 Larry McVoy. + * Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* #define _DEBUG */ + +#define nz(x) ((x) == 0 ? 1 : (x)) + +/* + * I know you think these should be 2^10 and 2^20, but people are quoting + * disk sizes in powers of 10, and bandwidths are all power of ten. + * Deal with it. 
+ */ +#define MB (1000*1000.0) +#define KB (1000.0) + +static struct timeval start_tv, stop_tv; +FILE *ftiming; +static volatile uint64 use_result_dummy; +static uint64 iterations; +static void init_timing(void); + +#if defined(hpux) || defined(__hpux) +#include <sys/mman.h> +#endif + +#ifdef RUSAGE +#include <sys/resource.h> +#define SECS(tv) (tv.tv_sec + tv.tv_usec / 1000000.0) +#define mine(f) (int)(ru_stop.f - ru_start.f) + +static struct rusage ru_start, ru_stop; + +void +rusage(void) +{ + double sys, user, idle; + double per; + + sys = SECS(ru_stop.ru_stime) - SECS(ru_start.ru_stime); + user = SECS(ru_stop.ru_utime) - SECS(ru_start.ru_utime); + idle = timespent() - (sys + user); + per = idle / timespent() * 100; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "real=%.2f sys=%.2f user=%.2f idle=%.2f stall=%.0f%% ", + timespent(), sys, user, idle, per); + fprintf(ftiming, "rd=%d wr=%d min=%d maj=%d ctx=%d\n", + mine(ru_inblock), mine(ru_oublock), + mine(ru_minflt), mine(ru_majflt), + mine(ru_nvcsw) + mine(ru_nivcsw)); +} + +#endif /* RUSAGE */ + +void +lmbench_usage(int argc, char *argv[], char* usage) +{ + fprintf(stderr,"Usage: %s %s", argv[0], usage); + exit(-1); +} + + +void +sigchld_wait_handler(int signo) +{ + wait(0); + signal(SIGCHLD, sigchld_wait_handler); +} + +static int benchmp_sigterm_received; +static int benchmp_sigchld_received; +static pid_t benchmp_sigalrm_pid; +static int benchmp_sigalrm_timeout; +void (*benchmp_sigterm_handler)(int); +void (*benchmp_sigchld_handler)(int); +void (*benchmp_sigalrm_handler)(int); + +void +benchmp_sigterm(int signo) +{ + benchmp_sigterm_received = 1; +} + +void +benchmp_sigchld(int signo) +{ + signal(SIGCHLD, SIG_DFL); + benchmp_sigchld_received = 1; +#ifdef _DEBUG + fprintf(stderr, "benchmp_sigchld handler\n"); +#endif +} + +void +benchmp_sigalrm(int signo) +{ + signal(SIGALRM, SIG_IGN); + kill(benchmp_sigalrm_pid, SIGTERM); + /* + * Since we already waited a full timeout period for the child + * to die, 
we only need to wait a little longer for subsequent + * children to die. + */ + benchmp_sigalrm_timeout = 1; +} + +void +benchmp_child(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int childid, + int response, + int start_signal, + int result_signal, + int exit_signal, + int parallel, + iter_t iterations, + int repetitions, + int enough, + void* cookie + ); +void +benchmp_parent(int response, + int start_signal, + int result_signal, + int exit_signal, + pid_t* pids, + int parallel, + iter_t iterations, + int warmup, + int repetitions, + int enough + ); + +int +sizeof_result(int repetitions); + +void +benchmp(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie) +{ + iter_t iterations = 1; + double result = 0.; + double usecs; + long i, j; + pid_t pid; + pid_t *pids = NULL; + int response[2]; + int start_signal[2]; + int result_signal[2]; + int exit_signal[2]; + int need_warmup; + fd_set fds; + struct timeval timeout; + +#ifdef _DEBUG + fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): entering\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie); +#endif + enough = get_enough(enough); +#ifdef _DEBUG + fprintf(stderr, "\tenough=%d\n", enough); +#endif + + /* initialize results */ + settime(0); + save_n(1); + + if (parallel > 1) { + /* Compute the baseline performance */ + benchmp(initialize, benchmark, cleanup, + enough, 1, warmup, repetitions, cookie); + + /* if we can't even do a single job, then give up */ + if (gettime() == 0) + return; + + /* calculate iterations for 1sec runtime */ + iterations = get_n(); + if (enough < SHORT) { + double tmp = (double)SHORT * (double)get_n(); + tmp /= (double)gettime(); + iterations = (iter_t)tmp + 1; + } + settime(0); + save_n(1); + } + + /* Create the necessary pipes for control */ + if (pipe(response) < 0 + || pipe(start_signal) < 0 + || pipe(result_signal) < 0 + || 
pipe(exit_signal) < 0) { +#ifdef _DEBUG + fprintf(stderr, "BENCHMP: Could not create control pipes\n"); +#endif /* _DEBUG */ + return; + } + + /* fork the necessary children */ + benchmp_sigchld_received = 0; + benchmp_sigterm_received = 0; + benchmp_sigterm_handler = signal(SIGTERM, benchmp_sigterm); + benchmp_sigchld_handler = signal(SIGCHLD, benchmp_sigchld); + pids = (pid_t*)malloc(parallel * sizeof(pid_t)); + if (!pids) return; + bzero((void*)pids, parallel * sizeof(pid_t)); + + for (i = 0; i < parallel; ++i) { + if (benchmp_sigterm_received) + goto error_exit; +#ifdef _DEBUG + fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): creating child %d\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie, i); +#endif + switch(pids[i] = fork()) { + case -1: + /* could not open enough children! */ +#ifdef _DEBUG + fprintf(stderr, "BENCHMP: fork() failed!\n"); +#endif /* _DEBUG */ + goto error_exit; + case 0: + /* If child */ + close(response[0]); + close(start_signal[1]); + close(result_signal[1]); + close(exit_signal[1]); + handle_scheduler(i, 0, 0); + benchmp_child(initialize, + benchmark, + cleanup, + i, + response[1], + start_signal[0], + result_signal[0], + exit_signal[0], + enough, + iterations, + parallel, + repetitions, + cookie + ); + exit(0); + default: + break; + } + } + close(response[1]); + close(start_signal[0]); + close(result_signal[0]); + close(exit_signal[0]); + benchmp_parent(response[0], + start_signal[1], + result_signal[1], + exit_signal[1], + pids, + parallel, + iterations, + warmup, + repetitions, + enough + ); + goto cleanup_exit; + +error_exit: + /* give the children a chance to clean up gracefully */ + signal(SIGCHLD, SIG_DFL); + while (--i >= 0) { + kill(pids[i], SIGTERM); + waitpid(pids[i], NULL, 0); + } + +cleanup_exit: + /* + * Clean up and kill all children + * + * NOTE: the children themselves SHOULD exit, and + * Killing them could prevent them from + * cleanup up subprocesses, etc... 
So, we only + * want to kill child processes when it appears + * that they will not die of their own accord. + * We wait twice the timing interval plus two seconds + * for children to die. If they haven't died by + * that time, then we start killing them. + */ + benchmp_sigalrm_timeout = (int)((2 * enough)/1000000) + 2; + if (benchmp_sigalrm_timeout < 5) + benchmp_sigalrm_timeout = 5; + signal(SIGCHLD, SIG_DFL); + while (i-- > 0) { + /* wait timeout seconds for child to die, then kill it */ + benchmp_sigalrm_pid = pids[i]; + benchmp_sigalrm_handler = signal(SIGALRM, benchmp_sigalrm); + alarm(benchmp_sigalrm_timeout); + + waitpid(pids[i], NULL, 0); + + alarm(0); + signal(SIGALRM, benchmp_sigalrm_handler); + } + + if (pids) free(pids); +#ifdef _DEBUG + fprintf(stderr, "benchmp(0x%x, 0x%x, 0x%x, %d, %d, 0x%x): exiting\n", (unsigned int)initialize, (unsigned int)benchmark, (unsigned int)cleanup, enough, parallel, (unsigned int)cookie); +#endif +} + +void +benchmp_parent( int response, + int start_signal, + int result_signal, + int exit_signal, + pid_t* pids, + int parallel, + iter_t iterations, + int warmup, + int repetitions, + int enough + ) +{ + int i,j,k,l; + int bytes_read; + result_t* results = NULL; + result_t* merged_results = NULL; + char* signals = NULL; + unsigned char* buf; + fd_set fds_read, fds_error; + struct timeval timeout; + + if (benchmp_sigchld_received || benchmp_sigterm_received) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: entering, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + + results = (result_t*)malloc(sizeof_result(repetitions)); + merged_results = (result_t*)malloc(sizeof_result(parallel * repetitions)); + signals = (char*)malloc(parallel * sizeof(char)); + if (!results || !merged_results || !signals) return; + + /* Collect 'ready' signals */ + for (i = 0; i < parallel * sizeof(char); i += bytes_read) { + bytes_read = 0; + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + FD_SET(response, 
&fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: ready, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, signals, parallel * sizeof(char) - i); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: ready, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + + /* let the children run for warmup microseconds */ + if (warmup > 0) { + struct timeval delay; + delay.tv_sec = warmup / 1000000; + delay.tv_usec = warmup % 1000000; + + select(0, NULL, NULL, NULL, &delay); + } + + /* send 'start' signal */ + write(start_signal, signals, parallel * sizeof(char)); + + /* Collect 'done' signals */ + for (i = 0; i < parallel * sizeof(char); i += bytes_read) { + bytes_read = 0; + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + FD_SET(response, &fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: done, benchmp_child_died=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, signals, parallel * sizeof(char) - i); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: done, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + + /* collect results */ + insertinit(merged_results); + for (i = 0; i < parallel; ++i) { + int n = sizeof_result(repetitions); + buf = 
(unsigned char*)results; + + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + + /* tell one child to report its results */ + write(result_signal, buf, sizeof(char)); + + for (; n > 0; n -= bytes_read, buf += bytes_read) { + bytes_read = 0; + FD_SET(response, &fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: results, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, buf, n); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: results, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + for (j = 0; j < results->N; ++j) { + insertsort(results->v[j].u, + results->v[j].n, merged_results); + } + } + + /* we allow children to die now, without it causing an error */ + signal(SIGCHLD, SIG_DFL); + + /* send 'exit' signals */ + write(exit_signal, results, parallel * sizeof(char)); + + /* Compute median time; iterations is constant! 
*/ + set_results(merged_results); + + goto cleanup_exit; +error_exit: +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: error_exit!\n"); +#endif + signal(SIGCHLD, SIG_DFL); + for (i = 0; i < parallel; ++i) { + kill(pids[i], SIGTERM); + waitpid(pids[i], NULL, 0); + } + free(merged_results); +cleanup_exit: + close(response); + close(start_signal); + close(result_signal); + close(exit_signal); + + if (results) free(results); + if (signals) free(signals); +} + + +typedef enum { warmup, timing_interval, cooldown } benchmp_state; + +typedef struct { + benchmp_state state; + benchmp_f initialize; + benchmp_f benchmark; + benchmp_f cleanup; + int childid; + int response; + int start_signal; + int result_signal; + int exit_signal; + int enough; + iter_t iterations; + int parallel; + int repetitions; + void* cookie; + iter_t iterations_batch; + int need_warmup; + long i; + int r_size; + result_t* r; +} benchmp_child_state; + +static benchmp_child_state _benchmp_child_state; + +int +benchmp_childid() +{ + return _benchmp_child_state.childid; +} + +void +benchmp_child_sigchld(int signo) +{ +#ifdef _DEBUG + fprintf(stderr, "benchmp_child_sigchld handler\n"); +#endif + if (_benchmp_child_state.cleanup) { + signal(SIGCHLD, SIG_DFL); + (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); + } + exit(1); +} + +void +benchmp_child_sigterm(int signo) +{ + signal(SIGTERM, SIG_IGN); + if (_benchmp_child_state.cleanup) { + void (*sig)(int) = signal(SIGCHLD, SIG_DFL); + if (sig != benchmp_child_sigchld && sig != SIG_DFL) { + signal(SIGCHLD, sig); + } + (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); + } + exit(0); +} + +void* +benchmp_getstate() +{ + return ((void*)&_benchmp_child_state); +} + +void +benchmp_child(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int childid, + int response, + int start_signal, + int result_signal, + int exit_signal, + int enough, + iter_t iterations, + int parallel, + int repetitions, + void* cookie + ) +{ + iter_t 
iterations_batch = (parallel > 1) ? get_n() : 1; + double result = 0.; + double usecs; + long i = 0; + int need_warmup; + fd_set fds; + struct timeval timeout; + + _benchmp_child_state.state = warmup; + _benchmp_child_state.initialize = initialize; + _benchmp_child_state.benchmark = benchmark; + _benchmp_child_state.cleanup = cleanup; + _benchmp_child_state.childid = childid; + _benchmp_child_state.response = response; + _benchmp_child_state.start_signal = start_signal; + _benchmp_child_state.result_signal = result_signal; + _benchmp_child_state.exit_signal = exit_signal; + _benchmp_child_state.enough = enough; + _benchmp_child_state.iterations = iterations; + _benchmp_child_state.iterations_batch = iterations_batch; + _benchmp_child_state.parallel = parallel; + _benchmp_child_state.repetitions = repetitions; + _benchmp_child_state.cookie = cookie; + _benchmp_child_state.need_warmup = 1; + _benchmp_child_state.i = 0; + _benchmp_child_state.r_size = sizeof_result(repetitions); + _benchmp_child_state.r = (result_t*)malloc(_benchmp_child_state.r_size); + + if (!_benchmp_child_state.r) return; + insertinit(_benchmp_child_state.r); + set_results(_benchmp_child_state.r); + + need_warmup = 1; + timeout.tv_sec = 0; + timeout.tv_usec = 0; + + if (benchmp_sigchld_handler != SIG_DFL) { + signal(SIGCHLD, benchmp_sigchld_handler); + } else { + signal(SIGCHLD, benchmp_child_sigchld); + } + + if (initialize) + (*initialize)(0, cookie); + + if (benchmp_sigterm_handler != SIG_DFL) { + signal(SIGTERM, benchmp_sigterm_handler); + } else { + signal(SIGTERM, benchmp_child_sigterm); + } + if (benchmp_sigterm_received) + benchmp_child_sigterm(SIGTERM); + + /* start experiments, collecting results */ + insertinit(_benchmp_child_state.r); + + while (1) { + (*benchmark)(benchmp_interval(&_benchmp_child_state), cookie); + } +} + +iter_t +benchmp_interval(void* _state) +{ + char c; + iter_t iterations; + double result; + fd_set fds; + struct timeval timeout; + benchmp_child_state* state = 
(benchmp_child_state*)_state; + + iterations = (state->state == timing_interval ? state->iterations : state->iterations_batch); + + if (!state->need_warmup) { + result = stop(0,0); + if (state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(iterations, state->cookie); + } + save_n(state->iterations); + result -= t_overhead() + get_n() * l_overhead(); + settime(result >= 0. ? (uint64)result : 0.); + } + + /* if the parent died, then give up */ + if (getppid() == 1 && state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(0, state->cookie); + exit(0); + } + + timeout.tv_sec = 0; + timeout.tv_usec = 0; + FD_ZERO(&fds); + + switch (state->state) { + case warmup: + iterations = state->iterations_batch; + FD_SET(state->start_signal, &fds); + select(state->start_signal+1, &fds, NULL, + NULL, &timeout); + if (FD_ISSET(state->start_signal, &fds)) { + state->state = timing_interval; + read(state->start_signal, &c, sizeof(char)); + iterations = state->iterations; + } + if (state->need_warmup) { + state->need_warmup = 0; + /* send 'ready' */ + write(state->response, &c, sizeof(char)); + } + break; + case timing_interval: + iterations = state->iterations; + if (state->parallel > 1 || result > 0.95 * state->enough) { + insertsort(gettime(), get_n(), get_results()); + state->i++; + /* we completed all the experiments, return results */ + if (state->i >= state->repetitions) { + state->state = cooldown; + } + } + if (state->parallel == 1 + && (result < 0.99 * state->enough || result > 1.2 * state->enough)) { + if (result > 150.) { + double tmp = iterations / result; + tmp *= 1.1 * state->enough; + iterations = (iter_t)(tmp + 1); + } else { + iterations <<= 3; + if (iterations > 1<<27 + || result < 0. 
&& iterations > 1<<20) { + state->state = cooldown; + } + } + } + state->iterations = iterations; + if (state->state == cooldown) { + /* send 'done' */ + write(state->response, (void*)&c, sizeof(char)); + iterations = state->iterations_batch; + } + break; + case cooldown: + iterations = state->iterations_batch; + FD_SET(state->result_signal, &fds); + select(state->result_signal+1, &fds, NULL, NULL, &timeout); + if (FD_ISSET(state->result_signal, &fds)) { + /* + * At this point all children have stopped their + * measurement loops, so we can block waiting for + * the parent to tell us to send our results back. + * From this point on, we will do no more "work". + */ + read(state->result_signal, (void*)&c, sizeof(char)); + write(state->response, (void*)get_results(), state->r_size); + if (state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(0, state->cookie); + } + + /* Now wait for signal to exit */ + read(state->exit_signal, (void*)&c, sizeof(char)); + exit(0); + } + }; + if (state->initialize) { + (*state->initialize)(iterations, state->cookie); + } + start(0); + return (iterations); +} + + +/* + * Redirect output someplace else. + */ +void +timing(FILE *out) +{ + ftiming = out; +} + +/* + * Start timing now. + */ +void +start(struct timeval *tv) +{ + if (tv == NULL) { + tv = &start_tv; + } +#ifdef RUSAGE + getrusage(RUSAGE_SELF, &ru_start); +#endif + (void) gettimeofday(tv, (struct timezone *) 0); +} + +/* + * Stop timing and return real time in microseconds. 
+ */ +uint64 +stop(struct timeval *begin, struct timeval *end) +{ + if (end == NULL) { + end = &stop_tv; + } + (void) gettimeofday(end, (struct timezone *) 0); +#ifdef RUSAGE + getrusage(RUSAGE_SELF, &ru_stop); +#endif + + if (begin == NULL) { + begin = &start_tv; + } + return (tvdelta(begin, end)); +} + +uint64 +now(void) +{ + struct timeval t; + uint64 m; + + (void) gettimeofday(&t, (struct timezone *) 0); + m = t.tv_sec; + m *= 1000000; + m += t.tv_usec; + return (m); +} + +double +Now(void) +{ + struct timeval t; + + (void) gettimeofday(&t, (struct timezone *) 0); + return (t.tv_sec * 1000000.0 + t.tv_usec); +} + +uint64 +delta(void) +{ + static struct timeval last; + struct timeval t; + struct timeval diff; + uint64 m; + + (void) gettimeofday(&t, (struct timezone *) 0); + if (last.tv_usec) { + tvsub(&diff, &t, &last); + last = t; + m = diff.tv_sec; + m *= 1000000; + m += diff.tv_usec; + return (m); + } else { + last = t; + return (0); + } +} + +double +Delta(void) +{ + struct timeval t; + struct timeval diff; + + (void) gettimeofday(&t, (struct timezone *) 0); + tvsub(&diff, &t, &start_tv); + return (diff.tv_sec + diff.tv_usec / 1000000.0); +} + +void +save_n(uint64 n) +{ + iterations = n; +} + +uint64 +get_n(void) +{ + return (iterations); +} + +/* + * Make the time spend be usecs. 
+ */ +void +settime(uint64 usecs) +{ + bzero((void*)&start_tv, sizeof(start_tv)); + stop_tv.tv_sec = usecs / 1000000; + stop_tv.tv_usec = usecs % 1000000; +} + +void +bandwidth(uint64 bytes, uint64 times, int verbose) +{ + struct timeval tdiff; + double mb, secs; + + tvsub(&tdiff, &stop_tv, &start_tv); + secs = tdiff.tv_sec; + secs *= 1000000; + secs += tdiff.tv_usec; + secs /= 1000000; + secs /= times; + mb = bytes / MB; + if (!ftiming) ftiming = stderr; + if (verbose) { + (void) fprintf(ftiming, + "%.4f MB in %.4f secs, %.4f MB/sec\n", + mb, secs, mb/secs); + } else { + if (mb < 1) { + (void) fprintf(ftiming, "%.6f ", mb); + } else { + (void) fprintf(ftiming, "%.2f ", mb); + } + if (mb / secs < 1) { + (void) fprintf(ftiming, "%.6f\n", mb/secs); + } else { + (void) fprintf(ftiming, "%.2f\n", mb/secs); + } + } +} + +void +kb(uint64 bytes) +{ + struct timeval td; + double s, bs; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + bs = bytes / nz(s); + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + (void) fprintf(ftiming, "%.0f KB/sec\n", bs / KB); +} + +void +mb(uint64 bytes) +{ + struct timeval td; + double s, bs; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + bs = bytes / nz(s); + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + (void) fprintf(ftiming, "%.2f MB/sec\n", bs / MB); +} + +void +latency(uint64 xfers, uint64 size) +{ + struct timeval td; + double s; + + if (!ftiming) ftiming = stderr; + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (xfers > 1) { + fprintf(ftiming, "%d %dKB xfers in %.2f secs, ", + (int) xfers, (int) (size / KB), s); + } else { + fprintf(ftiming, "%.1fKB in ", size / KB); + } + if ((s * 1000 / xfers) > 100) { + fprintf(ftiming, "%.0f millisec%s, ", + s * 1000 / xfers, xfers > 1 ? "/xfer" : "s"); + } else { + fprintf(ftiming, "%.4f millisec%s, ", + s * 1000 / xfers, xfers > 1 ? 
"/xfer" : "s"); + } + if (((xfers * size) / (MB * s)) > 1) { + fprintf(ftiming, "%.2f MB/sec\n", (xfers * size) / (MB * s)); + } else { + fprintf(ftiming, "%.2f KB/sec\n", (xfers * size) / (KB * s)); + } +} + +void +context(uint64 xfers) +{ + struct timeval td; + double s; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, + "%d context switches in %.2f secs, %.0f microsec/switch\n", + (int)xfers, s, s * 1000000 / xfers); +} + +void +nano(char *s, uint64 n) +{ + struct timeval td; + double micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro *= 1000; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "%s: %.2f nanoseconds\n", s, micro / n); +} + +void +micro(char *s, uint64 n) +{ + struct timeval td; + double micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro /= n; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); +#if 0 + if (micro >= 100) { + fprintf(ftiming, "%s: %.1f microseconds\n", s, micro); + } else if (micro >= 10) { + fprintf(ftiming, "%s: %.3f microseconds\n", s, micro); + } else { + fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); + } +#endif +} + +void +micromb(uint64 sz, uint64 n) +{ + struct timeval td; + double mb, micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro /= n; + mb = sz; + mb /= MB; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + if (micro >= 10) { + fprintf(ftiming, "%.6f %.0f\n", mb, micro); + } else { + fprintf(ftiming, "%.6f %.3f\n", mb, micro); + } +} + +void +milli(char *s, uint64 n) +{ + struct timeval td; + uint64 milli; + + tvsub(&td, &stop_tv, &start_tv); + milli = td.tv_sec * 1000 + td.tv_usec / 1000; + milli /= n; + if (milli == 0.0) return; + if (!ftiming) ftiming = stderr; 
+ fprintf(ftiming, "%s: %d milliseconds\n", s, (int)milli); +} + +void +ptime(uint64 n) +{ + struct timeval td; + double s; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, + "%d in %.2f secs, %.0f microseconds each\n", + (int)n, s, s * 1000000 / n); +} + +uint64 +tvdelta(struct timeval *start, struct timeval *stop) +{ + struct timeval td; + uint64 usecs; + + tvsub(&td, stop, start); + usecs = td.tv_sec; + usecs *= 1000000; + usecs += td.tv_usec; + return (usecs); +} + +void +tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0) +{ + tdiff->tv_sec = t1->tv_sec - t0->tv_sec; + tdiff->tv_usec = t1->tv_usec - t0->tv_usec; + if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) { + tdiff->tv_sec--; + tdiff->tv_usec += 1000000; + assert(tdiff->tv_usec >= 0); + } + + /* time shouldn't go backwards!!! */ + if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) { + tdiff->tv_sec = 0; + tdiff->tv_usec = 0; + } +} + +uint64 +gettime(void) +{ + return (tvdelta(&start_tv, &stop_tv)); +} + +double +timespent(void) +{ + struct timeval td; + + tvsub(&td, &stop_tv, &start_tv); + return (td.tv_sec + td.tv_usec / 1000000.0); +} + +static char p64buf[10][20]; +static int n; + +char * +p64(uint64 big) +{ + char *s = p64buf[n++]; + + if (n == 10) n = 0; +#ifdef linux + { + int *a = (int*)&big; + + if (a[1]) { + sprintf(s, "0x%x%08x", a[1], a[0]); + } else { + sprintf(s, "0x%x", a[0]); + } + } +#endif +#ifdef __sgi + sprintf(s, "0x%llx", big); +#endif + return (s); +} + +char * +p64sz(uint64 big) +{ + double d = big; + char *tags = " KMGTPE"; + int t = 0; + char *s = p64buf[n++]; + + if (n == 10) n = 0; + while (d > 512) t++, d /= 1024; + if (d == 0) { + return ("0"); + } + if (d < 100) { + sprintf(s, "%.4f%c", d, tags[t]); + } else { + sprintf(s, "%.2f%c", d, tags[t]); + } + return (s); +} + +char +last(char *s) +{ + while (*s++) + ; + return (s[-2]); +} + +uint64 +bytes(char 
*s) +{ + uint64 n; + + if (sscanf(s, "%llu", &n) < 1) + return (0); + + if ((last(s) == 'k') || (last(s) == 'K')) + n *= 1024; + if ((last(s) == 'm') || (last(s) == 'M')) + n *= (1024 * 1024); + return (n); +} + +void +use_int(int result) { use_result_dummy += result; } + +void +use_pointer(void *result) { use_result_dummy += (long)result; } + +int +sizeof_result(int repetitions) +{ + if (repetitions <= TRIES) + return (sizeof(result_t)); + return (sizeof(result_t) + (repetitions - TRIES) * sizeof(value_t)); +} + +void +insertinit(result_t *r) +{ + int i; + + r->N = 0; +} + +/* biggest to smallest */ +void +insertsort(uint64 u, uint64 n, result_t *r) +{ + int i, j; + + if (u == 0) return; + +#ifdef _DEBUG + fprintf(stderr, "\tinsertsort(%llu, %llu, %p)\n", u, n, r); +#endif /* _DEBUG */ + for (i = 0; i < r->N; ++i) { + if (u/(double)n > r->v[i].u/(double)r->v[i].n) { + for (j = r->N; j > i; --j) { + r->v[j] = r->v[j - 1]; + } + break; + } + } + r->v[i].u = u; + r->v[i].n = n; + r->N++; +} + +static result_t _results; +static result_t* results = &_results; + +result_t* +get_results() +{ + return (results); +} + +void +set_results(result_t *r) +{ + results = r; + save_median(); +} + +void +save_minimum() +{ + if (results->N == 0) { + save_n(1); + settime(0); + } else { + save_n(results->v[results->N - 1].n); + settime(results->v[results->N - 1].u); + } +} + +void +save_median() +{ + int i = results->N / 2; + uint64 u, n; + + if (results->N == 0) { + n = 1; + u = 0; + } else if (results->N % 2) { + n = results->v[i].n; + u = results->v[i].u; + } else { + n = (results->v[i].n + results->v[i-1].n) / 2; + u = (results->v[i].u + results->v[i-1].u) / 2; + } +#ifdef _DEBUG + fprintf(stderr, "save_median: N=%d, n=%lu, u=%lu\n", results->N, (unsigned long)n, (unsigned long)u); +#endif /* _DEBUG */ + save_n(n); settime(u); +} + +/* + * The inner loop tracks bench.h but uses a different results array. 
+ */ +static long * +one_op(register long *p) +{ + BENCH_INNER(p = (long *)*p;, 0); + return (p); +} + +static long * +two_op(register long *p) +{ + BENCH_INNER(p = (long *)*p; p = (long*)*p;, 0); + return (p); +} + +static long *p = (long *)&p; +static long *q = (long *)&q; + +double +l_overhead(void) +{ + int i; + uint64 N_save, u_save; + static double overhead; + static int initialized = 0; + result_t one, two, *r_save; + + init_timing(); + if (initialized) return (overhead); + + initialized = 1; + if (getenv("LOOP_O")) { + overhead = atof(getenv("LOOP_O")); + } else { + r_save = get_results(); N_save = get_n(); u_save = gettime(); + insertinit(&one); + insertinit(&two); + for (i = 0; i < TRIES; ++i) { + use_pointer((void*)one_op(p)); + if (gettime() > t_overhead()) + insertsort(gettime() - t_overhead(), get_n(), &one); + use_pointer((void *)two_op(p)); + if (gettime() > t_overhead()) + insertsort(gettime() - t_overhead(), get_n(), &two); + } + /* + * u1 = (n1 * (overhead + work)) + * u2 = (n2 * (overhead + 2 * work)) + * ==> overhead = 2. * u1 / n1 - u2 / n2 + */ + set_results(&one); + save_minimum(); + overhead = 2. * gettime() / (double)get_n(); + + set_results(&two); + save_minimum(); + overhead -= gettime() / (double)get_n(); + + if (overhead < 0.) overhead = 0.; /* Gag */ + + set_results(r_save); save_n(N_save); settime(u_save); + } + return (overhead); +} + +/* + * Figure out the timing overhead. 
This has to track bench.h + */ +uint64 +t_overhead(void) +{ + uint64 N_save, u_save; + static int initialized = 0; + static uint64 overhead = 0; + struct timeval tv; + result_t *r_save; + + init_timing(); + if (initialized) return (overhead); + + initialized = 1; + if (getenv("TIMING_O")) { + overhead = atof(getenv("TIMING_O")); + } else if (get_enough(0) <= 50000) { + /* it is not in the noise, so compute it */ + int i; + result_t r; + + r_save = get_results(); N_save = get_n(); u_save = gettime(); + insertinit(&r); + for (i = 0; i < TRIES; ++i) { + BENCH_INNER(gettimeofday(&tv, 0), 0); + insertsort(gettime(), get_n(), &r); + } + set_results(&r); + save_minimum(); + overhead = gettime() / get_n(); + + set_results(r_save); save_n(N_save); settime(u_save); + } + return (overhead); +} + +/* + * Figure out how long to run it. + * If enough == 0, then they want us to figure it out. + * If enough is !0 then return it unless we think it is too short. + */ +static int long_enough; +static int compute_enough(); + +int +get_enough(int e) +{ + init_timing(); + return (long_enough > e ? 
long_enough : e); +} + + +static void +init_timing(void) +{ + static int done = 0; + + if (done) return; + done = 1; + long_enough = compute_enough(); + t_overhead(); + l_overhead(); +} + +typedef long TYPE; + +static TYPE ** +enough_duration(register long N, register TYPE ** p) +{ +#define ENOUGH_DURATION_TEN(one) one one one one one one one one one one + while (N-- > 0) { + ENOUGH_DURATION_TEN(p = (TYPE **) *p;); + } + return (p); +} + +static uint64 +duration(long N) +{ + uint64 usecs; + TYPE *x = (TYPE *)&x; + TYPE **p = (TYPE **)&x; + + start(0); + p = enough_duration(N, p); + usecs = stop(0, 0); + use_pointer((void *)p); + return (usecs); +} + +/* + * find the minimum time that work "N" takes in "tries" tests + */ +static uint64 +time_N(iter_t N) +{ + int i; + uint64 usecs; + result_t r, *r_save; + + r_save = get_results(); + insertinit(&r); + for (i = 1; i < TRIES; ++i) { + usecs = duration(N); + insertsort(usecs, N, &r); + } + set_results(&r); + save_minimum(); + usecs = gettime(); + set_results(r_save); + return (usecs); +} + +/* + * return the amount of work needed to run "enough" microseconds + */ +static iter_t +find_N(int enough) +{ + int tries; + static iter_t N = 10000; + static uint64 usecs = 0; + + if (!usecs) usecs = time_N(N); + + for (tries = 0; tries < 10; ++tries) { + if (0.98 * enough < usecs && usecs < 1.02 * enough) + return (N); + if (usecs < 1000) + N *= 10; + else { + double n = N; + + n /= usecs; + n *= enough; + N = n + 1; + } + usecs = time_N(N); + } + return (0); +} + +/* + * We want to verify that small modifications proportionally affect the runtime + */ +static double test_points[] = {1.015, 1.02, 1.035}; +static int +test_time(int enough) +{ + int i; + iter_t N; + uint64 usecs, expected, baseline, diff; + + if ((N = find_N(enough)) == 0) + return (0); + + baseline = time_N(N); + + for (i = 0; i < sizeof(test_points) / sizeof(double); ++i) { + usecs = time_N((int)((double) N * test_points[i])); + expected = 
(uint64)((double)baseline * test_points[i]); + diff = expected > usecs ? expected - usecs : usecs - expected; + if (diff / (double)expected > 0.0025) + return (0); + } + return (1); +} + + +/* + * We want to find the smallest timing interval that has accurate timing + */ +static int possibilities[] = { 5000, 10000, 50000, 100000 }; +static int +compute_enough() +{ + int i; + + if (getenv("ENOUGH")) { + return (atoi(getenv("ENOUGH"))); + } + for (i = 0; i < sizeof(possibilities) / sizeof(int); ++i) { + if (test_time(possibilities[i])) + return (possibilities[i]); + } + + /* + * if we can't find a timing interval that is sufficient, + * then use SHORT as a default. + */ + return (SHORT); +} + +/* + * This stuff isn't really lib_timing, but ... + */ +void +morefds(void) +{ +#ifdef RLIMIT_NOFILE + struct rlimit r; + + getrlimit(RLIMIT_NOFILE, &r); + r.rlim_cur = r.rlim_max; + setrlimit(RLIMIT_NOFILE, &r); +#endif +} + +/* analogous to bzero, bcopy, etc., except that it just reads + * data into the processor + */ +long +bread(void* buf, long nbytes) +{ + long sum = 0; + register long *p, *next; + register char *end; + + p = (long*)buf; + end = (char*)buf + nbytes; + for (next = p + 128; (void*)next <= (void*)end; p = next, next += 128) { + sum += + p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ + p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ + p[15]+p[16]+p[17]+p[18]+p[19]+p[20]+p[21]+ + p[22]+p[23]+p[24]+p[25]+p[26]+p[27]+p[28]+ + p[29]+p[30]+p[31]+p[32]+p[33]+p[34]+p[35]+ + p[36]+p[37]+p[38]+p[39]+p[40]+p[41]+p[42]+ + p[43]+p[44]+p[45]+p[46]+p[47]+p[48]+p[49]+ + p[50]+p[51]+p[52]+p[53]+p[54]+p[55]+p[56]+ + p[57]+p[58]+p[59]+p[60]+p[61]+p[62]+p[63]+ + p[64]+p[65]+p[66]+p[67]+p[68]+p[69]+p[70]+ + p[71]+p[72]+p[73]+p[74]+p[75]+p[76]+p[77]+ + p[78]+p[79]+p[80]+p[81]+p[82]+p[83]+p[84]+ + p[85]+p[86]+p[87]+p[88]+p[89]+p[90]+p[91]+ + p[92]+p[93]+p[94]+p[95]+p[96]+p[97]+p[98]+ + p[99]+p[100]+p[101]+p[102]+p[103]+p[104]+ + p[105]+p[106]+p[107]+p[108]+p[109]+p[110]+ + 
p[111]+p[112]+p[113]+p[114]+p[115]+p[116]+ + p[117]+p[118]+p[119]+p[120]+p[121]+p[122]+ + p[123]+p[124]+p[125]+p[126]+p[127]; + } + for (next = p + 16; (void*)next <= (void*)end; p = next, next += 16) { + sum += + p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ + p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ + p[15]; + } + for (next = p + 1; (void*)next <= (void*)end; p = next, next++) { + sum += *p; + } + return sum; +} + +void +touch(char *buf, int nbytes) +{ + static psize; + + if (!psize) { + psize = getpagesize(); + } + while (nbytes > 0) { + *buf = 1; + buf += psize; + nbytes -= psize; + } +} + +size_t* +permutation(int max, int scale) +{ + size_t i, v; + static size_t r = 0; + size_t* result = (size_t*)malloc(max * sizeof(size_t)); + + if (result == NULL) return NULL; + + for (i = 0; i < max; ++i) { + result[i] = i * (size_t)scale; + } + + if (r == 0) + r = (getpid()<<6) ^ getppid() ^ rand() ^ (rand()<<10); + + /* randomize the sequence */ + for (i = max - 1; i > 0; --i) { + r = (r << 1) ^ rand(); + v = result[r % (i + 1)]; + result[r % (i + 1)] = result[i]; + result[i] = v; + } + +#ifdef _DEBUG + fprintf(stderr, "permutation(%d): {", max); + for (i = 0; i < max; ++i) { + fprintf(stderr, "%d", result[i]); + if (i < max - 1) + fprintf(stderr, ","); + } + fprintf(stderr, "}\n"); + fflush(stderr); +#endif /* _DEBUG */ + + return (result); +} + +int +cp(char* src, char* dst, mode_t mode) +{ + int sfd, dfd; + char buf[8192]; + ssize_t size; + + if ((sfd = open(src, O_RDONLY)) < 0) { + return -1; + } + if ((dfd = open(dst, O_CREAT|O_TRUNC|O_RDWR, mode)) < 0) { + return -1; + } + while ((size = read(sfd, buf, 8192)) > 0) { + if (write(dfd, buf, size) < size) return -1; + } + fsync(dfd); + close(sfd); + close(dfd); +} + +#if defined(hpux) || defined(__hpux) +int +getpagesize() +{ + return (sysconf(_SC_PAGE_SIZE)); +} +#endif + +#ifdef WIN32 +int +getpagesize() +{ + SYSTEM_INFO s; + + GetSystemInfo(&s); + return ((int)s.dwPageSize); +} + +LARGE_INTEGER +getFILETIMEoffset() +{ + 
SYSTEMTIME s; + FILETIME f; + LARGE_INTEGER t; + + s.wYear = 1970; + s.wMonth = 1; + s.wDay = 1; + s.wHour = 0; + s.wMinute = 0; + s.wSecond = 0; + s.wMilliseconds = 0; + SystemTimeToFileTime(&s, &f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + return (t); +} + +int +gettimeofday(struct timeval *tv, struct timezone *tz) +{ + LARGE_INTEGER t; + FILETIME f; + double microseconds; + static LARGE_INTEGER offset; + static double frequencyToMicroseconds; + static int initialized = 0; + static BOOL usePerformanceCounter = 0; + + if (!initialized) { + LARGE_INTEGER performanceFrequency; + initialized = 1; + usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency); + if (usePerformanceCounter) { + QueryPerformanceCounter(&offset); + frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.; + } else { + offset = getFILETIMEoffset(); + frequencyToMicroseconds = 10.; + } + } + if (usePerformanceCounter) QueryPerformanceCounter(&t); + else { + GetSystemTimeAsFileTime(&f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + } + + t.QuadPart -= offset.QuadPart; + microseconds = (double)t.QuadPart / frequencyToMicroseconds; + t.QuadPart = microseconds; + tv->tv_sec = t.QuadPart / 1000000; + tv->tv_usec = t.QuadPart % 1000000; + return (0); +} +#endif diff --git a/performance/lmbench3/src/lib_udp.c b/performance/lmbench3/src/lib_udp.c new file mode 100644 index 0000000..4e4a5a6 --- /dev/null +++ b/performance/lmbench3/src/lib_udp.c @@ -0,0 +1,96 @@ +/* + * udp_lib.c - routines for managing UDP connections + * + * %W% %G% + * + * Copyright (c) 1994 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a UDP socket, bind it, figure out the port, + * and advertise the port as program "prog". + * + * XXX - it would be nice if you could advertise ascii strings. 
+ */ +int +udp_server(u_long prog, int rdwr) +{ + int sock; + struct sockaddr_in s; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + bzero((void*)&s, sizeof(s)); + s.sin_family = AF_INET; +#ifdef NO_PORTMAPPER + s.sin_port = htons(prog); +#endif + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } +#ifndef NO_PORTMAPPER + (void)pmap_unset(prog, (u_long)1); + if (!pmap_set(prog, (u_long)1, (u_long)IPPROTO_UDP, + (unsigned short)sockport(sock))) { + perror("pmap_set"); + exit(5); + } +#endif + return (sock); +} + +/* + * Unadvertise the socket + */ +void +udp_done(int prog) +{ + (void)pmap_unset((u_long)prog, (u_long)1); +} + +/* + * "Connect" to the UCP socket advertised as "prog" on "host" and + * return the connected socket. + */ +int +udp_connect(char *host, u_long prog, int rdwr) +{ + struct hostent *h; + struct sockaddr_in sin; + int sock; + u_short port; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + if (!(h = gethostbyname(host))) { + perror(host); + exit(2); + } + bzero((void *) &sin, sizeof(sin)); + sin.sin_family = AF_INET; + bcopy((void*)h->h_addr, (void *) &sin.sin_addr, h->h_length); +#ifdef NO_PORTMAPPER + sin.sin_port = htons(prog); +#else + port = pmap_getport(&sin, prog, (u_long)1, IPPROTO_UDP); + if (!port) { + perror("lib UDP: No port found"); + exit(3); + } + sin.sin_port = htons(port); +#endif + if (connect(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { + perror("connect"); + exit(4); + } + return (sock); +} diff --git a/performance/lmbench3/src/lib_udp.h b/performance/lmbench3/src/lib_udp.h new file mode 100644 index 0000000..d414d52 --- /dev/null +++ b/performance/lmbench3/src/lib_udp.h @@ -0,0 +1,12 @@ +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> + +int 
udp_server(u_long prog, int rdwr); +void udp_done(int prog); +int udp_connect(char *host, u_long prog, int rdwr); +void sock_optimize(int sock, int rdwr); +int sockport(int); + diff --git a/performance/lmbench3/src/lib_unix.c b/performance/lmbench3/src/lib_unix.c new file mode 100644 index 0000000..bd588cd --- /dev/null +++ b/performance/lmbench3/src/lib_unix.c @@ -0,0 +1,97 @@ +/* + * unix_lib.c - routines for managing UNIX connections. + * + * Positive port/program numbers are RPC ports, negative ones are UNIX ports. + * + * Copyright (c) 1994-1996 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a UNIX socket, bind it. + */ +int +unix_server(char *path) +{ + int sock; + struct sockaddr_un s; + +#ifdef LIBUNIX_VERBOSE + fprintf(stderr, "unix_server(%s, %u)\n", prog, rdwr); +#endif + if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + bzero((void*)&s, sizeof(s)); + s.sun_family = AF_UNIX; + strcpy(s.sun_path, path); + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } + if (listen(sock, 100) < 0) { + perror("listen"); + exit(4); + } + return (sock); +} + +/* + * Unadvertise the socket + */ +int +unix_done(int sock, char *path) +{ + close(sock); + unlink(path); + return (0); +} + +/* + * Accept a connection and return it + */ +int +unix_accept(int sock) +{ + struct sockaddr_un s; + int newsock, namelen; + + namelen = sizeof(s); + bzero((void*)&s, namelen); + +retry: + if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { + if (errno == EINTR) + goto retry; + perror("accept"); + exit(6); + } + return (newsock); +} + +/* + * Connect to the UNIX socket advertised as "path" and + * return the connected socket. 
+ */ +int +unix_connect(char *path) +{ + struct sockaddr_un s; + int sock; + + if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + bzero((void*)&s, sizeof(s)); + s.sun_family = AF_UNIX; + strcpy(s.sun_path, path); + if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("connect"); + exit(4); + } + return (sock); +} + diff --git a/performance/lmbench3/src/lib_unix.h b/performance/lmbench3/src/lib_unix.h new file mode 100644 index 0000000..859e472 --- /dev/null +++ b/performance/lmbench3/src/lib_unix.h @@ -0,0 +1,8 @@ +/* lib_unix.c */ +#ifndef _LIB_UNIX_H_ +#define _LIB_UNIX_H_ +int unix_server(char *path); +int unix_done(int sock, char *path); +int unix_accept(int sock); +int unix_connect(char *path); +#endif diff --git a/performance/lmbench3/src/line.c b/performance/lmbench3/src/line.c new file mode 100644 index 0000000..3b5314d --- /dev/null +++ b/performance/lmbench3/src/line.c @@ -0,0 +1,68 @@ +/* + * line.c - guess the cache line size + * + * usage: line + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/4 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, j, l; + int verbose = 0; + int warmup = 0; + int repetitions = TRIES; + int c; + size_t maxlen = 64 * 1024 * 1024; + struct mem_state state; + char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; + + state.line = sizeof(char*); + state.pagesize = getpagesize(); + + while (( c = getopt(ac, av, "avM:W:N:")) != EOF) { + switch(c) { + case 'v': + verbose = 1; + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if ((l = line_find(maxlen, warmup, repetitions, &state)) > 0) { + if (verbose) { + printf("cache line size: %d bytes\n", l); + } else { + printf("%d\n", l); + } + } + + return (0); +} diff --git a/performance/lmbench3/src/lmdd.1 b/performance/lmbench3/src/lmdd.1 new file mode 100644 index 0000000..a1e7f7e --- /dev/null +++ b/performance/lmbench3/src/lmdd.1 @@ -0,0 +1,131 @@ +.\" %W% %G% +.TH LMDD 1 +.SH NAME +lmdd \- move io for performance and debugging tests +.SH SYNOPSIS +.B lmdd +[ +.IB option = value +] .\|.\|. +.SH DESCRIPTION +.B lmdd +copies a specified input file to a specified output with possible +conversions. This program is primarily useful for timing I/O since it +prints out the timing statistics after completing. +.SH OPTIONS +.TP 15 +.BI if= name +Input file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like Sun's +.IR /dev/zero , +i.e., it provides a buffer of zeros without doing a system call to get them. +.TP +.BI of= name +Output file is taken from +.IR name ; +.I internal +is the default. 
+.I internal +is a special file that acts like +.IR /dev/null , +without doing a system call to get rid of the data. +.TP +.BI bs= n +Input and output block size +.I n +bytes (default 8192). Note that this is different from dd(1), it has +a 512 byte default. Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +respectively. +.TP +.BI ipat= n +If +.B n +is non zero, expect a known pattern in the file (see opat). Mismatches +will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is +a sequence of 4 byte integers with the first 0, second 1, and so on. +The default is not to check for the pattern. +.TP +.BI opat= n +If +.B n +is non zero, generate a known pattern on the output stream. Used for +debugging file system correctness. +The default is not to generate the pattern. +.TP +.BI mismatch= n +If +.B n +is non zero, stop at the first mismatched value. Used with ipat. +.TP +.BI skip= n +Skip +.IR n "" +input blocks before starting copy. +.TP +.BI fsync= n +If +.I n +is non-zero, call fsync(2) on the output file before exiting or printing +timing statistics. +.TP +.BI sync= n +If +.I n +is non-zero, call sync(2) before exiting or printing +timing statistics. +.TP +.BI rand= n +This argument, by default off, turns on random behavior. The argument is +not a flag, it is a size, that size is used as the upper bound for the +seeks. +Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +.TP +.BI flush= n +If +.I n +is non-zero and mmap(2) is available, call msync(2) to invalidate the +output file. This flushes the file to disk so that you don't have +unmount/mount. It is not as good as mount/unmount because it just +flushes file pages - it misses the indirect blocks which are still +cached. Not supported on all systems, compile time option. 
+.TP +.BI rusage= n +If +.I n +is non-zero, print rusage statistics as well as timing statistics. +Not supported on all systems, compile time option. +.TP +.BI count= n +Copy only +.IR n "" +input records. +.SH EXAMPLES +.LP +This is the most common usage, the intent is to measure disk performance. +The disk is a spare partition mounted on /spare. +.sp +.nf +.in +4 +# mount /spare +# lmdd if=internal of=/spare/XXX count=1000 fsync=1 +7.81 MB in 3.78 seconds (2.0676 MB/sec) + +: Flush cache +# umount /spare +# mount /spare + +# lmdd if=/spare/XXX of=internal +7.81 MB in 2.83 seconds (2.7611 MB/sec) +.in +.sp +.fi +.SH AUTHOR +Larry McVoy, lm@xxxxxxx +.br +Not copyrighted. diff --git a/performance/lmbench3/src/lmdd.c b/performance/lmbench3/src/lmdd.c new file mode 100644 index 0000000..419f03f --- /dev/null +++ b/performance/lmbench3/src/lmdd.c @@ -0,0 +1,893 @@ +char *id = "$Id: lmdd.c,v 1.23 1997/12/01 23:47:59 lm Exp $\n"; +/* + * defaults: + * bs=8k + * count=forever + * if=internal + * of=internal + * ipat=0 + * opat=0 + * mismatch=0 + * rusage=0 + * flush=0 + * rand=0 + * print=0 + * direct=0 + * rt=0 + * rtmax=0 + * wtmax=0 + * rtmin=0 + * wtmin=0 + * label="" + * shorthands: + * k, m, g are 2^10, 2^20, 2^30 multipliers. + * K, M, G are 10^3, 10^6, 10^9 multipliers. + * recognizes "internal" as an internal /dev/zero /dev/null file. + * + * Copyright (c) 1994-1998 by Larry McVoy. All rights reserved. + * See the file COPYING for the licensing terms. + * + * TODO - rewrite this entire thing from scratch. This is disgusting code. 
+ */ + +#ifndef __Lynx__ +#define FLUSH +#endif + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/time.h> +#include "bench.h" + +#undef ALIGN +#define ALIGN(x, bs) ((x + (bs - 1)) & ~(bs - 1)) + +#ifdef FLUSH +#include <sys/mman.h> +#include <sys/stat.h> +void flush(void); +#endif + +#define USE_VALLOC +#ifdef USE_VALLOC +#define VALLOC valloc +#else +#define VALLOC malloc +#endif + +#ifdef __sgi +# define LSEEK(a,b,c) (uint64)lseek64(a, (off64_t)b, c) +# define ATOL(s) atoll(s) +#else +# define LSEEK(a,b,c) (uint64)lseek(a, b, c) +# define ATOL(s) atol(s) +#endif + + +int awrite, poff, out, Print, Fsync, Sync, Flush, Bsize, ru; +uint64 Start, End, Rand, int_count; +int hash; +int Realtime, Notrunc; +int Rtmax, Rtmin, Wtmax, Wtmin; +int rthist[12]; /* histogram of read times */ +int wthist[12]; /* histogram of write times */ +char *Label; +uint64 *norepeat; +int norepeats = -1; +#ifdef USE_BDS + bds_msg *m1, *m2; +#endif + +uint64 getarg(); +int been_there(uint64 off); +int getfile(char *s, int ac, char **av); + +char *cmds[] = { + "bs", /* block size */ + "bufs", /* use this many buffers round robin */ + "count", /* number of blocks */ +#ifdef DBG + "debug", /* set external variable "dbg" */ +#endif +#ifdef O_DIRECT + "direct", /* direct I/O on input and output */ + "idirect", /* direct I/O on input */ + "odirect", /* direct I/O on output */ +#endif +#ifdef FLUSH + "flush", /* map in out and invalidate (flush) */ +#endif + "fork", /* fork to do write I/O */ + "fsync", /* fsync output before exit */ + "if", /* input file */ + "ipat", /* check input for pattern */ + "label", /* prefix print out with this */ + "mismatch", /* stop at first mismatch */ + "move", /* instead of count, limit transfer to this */ + "of", /* output file */ + "opat", /* generate pattern on output */ + "print", /* report type */ + "rand", /* do randoms over 
the specified size */ + /* must be power of two, not checked */ + "poff", /* Print the offsets as we do the io. */ +#ifdef RUSAGE + "rusage", /* dump rusage stats */ +#endif + "skip", /* skip this number of blocks */ + "sync", /* sync output before exit */ + "touch", /* touch each buffer after the I/O */ +#if !defined(hpux) + "usleep", /* sleep this many usecs between I/O */ +#endif + "hash", /* hash marks like FTP */ + "append", /* O_APPEND */ + "rtmax", /* read latency histogram max in mills */ + "wtmax", /* write latency histogram max in mills */ + "rtmin", /* read latency histogram max in mills */ + "wtmin", /* write latency histogram max in mills */ + "realtime", /* create files as XFS realtime files */ + "notrunc", /* overwrite rather than truncing out file */ + "end", /* limit randoms to this size near the + * Rand endpoints. */ + "start", /* Add this to Rand */ + "time", /* Run for this many seconds only. */ + "srand", /* Seed the random number generator */ + "padin", /* Pad an extra untimed block_size read */ +#ifdef USE_BDS + "awrite", /* use async writes and pipeline them. 
*/ +#endif + "norepeat", /* don't ever do the same I/O twice */ +#ifdef sgi + "mpin", /* pin the buffer */ +#endif + "timeopen", /* include open time in results */ + "nocreate", /* just open for writing, don't create/trunc it */ +#ifdef O_SYNC + "osync", /* O_SYNC */ +#endif + 0, +}; + + +void error(char *); +void done(); +#ifdef DBG +extern int dbg; +#endif + +int +main(int ac, char **av) +{ + uint *buf; + uint *bufs[10]; + int nbufs, nextbuf = 0; + int Fork, misses, mismatch, outpat, inpat, in, timeopen, gotcnt; + int slp; + uint64 skip, size, count; + void chkarg(); + int i; + uint64 off = 0; + int touch; + int time; + int mills; + int pad_in; + int pid = 0; + struct timeval start_tv; + struct timeval stop_tv; + + if (sizeof(int) != 4) { + fprintf(stderr, "sizeof(int) != 4\n"); + exit(1); + } + for (i = 1; i < ac; ++i) { + chkarg(av[i]); + } + signal(SIGINT, done); + signal(SIGALRM, done); + misses = mismatch = getarg("mismatch=", ac, av); + inpat = getarg("ipat=", ac, av); + outpat = getarg("opat=", ac, av); + Bsize = getarg("bs=", ac, av); + if (Bsize < 0) + Bsize = 8192; +#if !defined(hpux) + slp = getarg("usleep=", ac, av); +#endif + Fork = getarg("fork=", ac, av); + Fsync = getarg("fsync=", ac, av); + Sync = getarg("sync=", ac, av); + Rand = getarg("rand=", ac, av); + Start = getarg("start=", ac, av); + End = getarg("end=", ac, av); + time = getarg("time=", ac, av); + if ((End != -1) && (Rand != -1) && (End > Rand)) { + End = Rand; + } + if (getarg("srand=", ac, av) != -1) { + srand48((long)getarg("srand=", ac, av)); + } + poff = getarg("poff=", ac, av) != -1; + Print = getarg("print=", ac, av); + nbufs = getarg("bufs=", ac, av); + Realtime = getarg("realtime=", ac, av); + Rtmax = getarg("rtmax=", ac, av); + if ((Rtmax != -1) && (Rtmax < 10)) + Rtmax = 10; + Rtmin = getarg("rtmin=", ac, av); + if ((Rtmax != -1) && (Rtmin == -1)) { + Rtmin = 0; + } + Wtmax = getarg("wtmax=", ac, av); + if ((Wtmax != -1) && (Wtmax < 10)) + Wtmax = 10; + Wtmin = 
getarg("wtmin=", ac, av); + if ((Wtmax != -1) && (Wtmin == -1)) { + Wtmin = 0; + } + if ((Rtmin && !Rtmax) || (Wtmin && !Wtmax)) { + fprintf(stderr, "Need a max to go with that min.\n"); + exit(1); + } + if ((Rtmin > Rtmax) || (Wtmin > Wtmax)) { + fprintf(stderr, + "min has to be less than max, R=%d,%d W=%d,%d\n", + Rtmax, Rtmin, Wtmax, Wtmin); + exit(1); + } + timeopen = getarg("timeopen=", ac, av); + pad_in = getarg("padin=", ac, av); + if (pad_in == -1) pad_in = 0; + + if (nbufs == -1) nbufs = 1; + if (nbufs > 10) { printf("Too many bufs\n"); exit(1); } +#ifdef DBG + dbg = getarg("debug=", ac, av) != -1; +#endif +#ifdef RUSAGE + ru = getarg("rusage=", ac, av); +#endif + touch = getarg("touch=", ac, av) != -1; + hash = getarg("hash=", ac, av) != (uint64)-1; + Label = (char *)getarg("label=", ac, av); + count = getarg("count=", ac, av); + size = getarg("move=", ac, av); + if (size != (uint64)-1) + count = size / Bsize; + if (Rand != -1) { + size = Rand - Bsize; + size = ALIGN(size, Bsize); + } + +#ifdef FLUSH + Flush = getarg("flush=", ac, av); +#endif + if (count == (uint64)-1) + gotcnt = 0; + else + gotcnt = 1; + int_count = 0; + skip = getarg("skip=", ac, av); + if (getarg("norepeat=", ac, av) != -1) { + if (gotcnt) { + norepeat = (uint64*)calloc(count, sizeof(uint64)); + } else { + norepeat = (uint64*)calloc(10<<10, sizeof(uint64)); + } + } + + if ((inpat != -1 || outpat != -1) && (Bsize & 3)) { + fprintf(stderr, "Block size 0x%x must be word aligned\n", Bsize); + exit(1); + } + if ((Bsize >> 2) == 0) { + fprintf(stderr, "Block size must be at least 4.\n"); + exit(1); + } + for (i = 0; i < nbufs; i++) { + if (!(bufs[i] = (uint *) VALLOC((unsigned) Bsize))) { + perror("VALLOC"); + exit(1); + } + bzero((char *) bufs[i], Bsize); +#ifdef sgi + if (getarg("mpin=", ac, av) != -1) { + if (mpin((void *)bufs[i], (size_t)Bsize)) { + perror("mpin for adam"); + } + } +#endif + } + + if (time != -1) { + alarm(time); + } + if (timeopen != -1) { + start(NULL); + } + in = 
getfile("if=", ac, av); + out = getfile("of=", ac, av); + if (timeopen == -1) { + start(NULL); + } + if ((Rtmax != -1) && in < 0) { + fprintf(stderr, "I think you wanted wtmax, not rtmax\n"); + exit(1); + } + if ((Wtmax != -1) && out < 0) { + fprintf(stderr, "I think you wanted rtmax, not wtmax\n"); + exit(1); + } + if (skip != (uint64)-1) { + off = skip; + off *= Bsize; + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + if (poff) { + fprintf(stderr, "%s ", p64sz(off)); + } + } + for (;;) { + register int moved; + + if (gotcnt && count-- <= 0) { + done(); + } + + /* + * If End is set, it means alternate back and forth + * between the end points of Rand, doing randoms within + * the area 0..End and Rand-End..Rand + */ + if (End != -1) { + static uint64 start = 0; + + start = start ? 0 : Rand - End; + do { + off = drand48() * End; + off = ALIGN(off, Bsize); + off += start; + if (Start != -1) { + off += Start; + } + } while (norepeat && been_there(off)); + if (norepeat) { + norepeat[norepeats++] = off; + if (!gotcnt && (norepeats == 10<<10)) { + norepeats = 0; + } + } + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + } + /* + * Set the seek pointer if doing randoms + */ + else if (Rand != -1) { + do { + off = drand48() * (size - Bsize); + if (Start != -1) { + off += Start; + } + off = ALIGN(off, Bsize); + } while (norepeat && been_there(off)); + if (norepeat) { + norepeat[norepeats++] = off; + } + if (!gotcnt && (norepeats == 10<<10)) { + norepeats = 0; + } + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + } + if (poff) { + fprintf(stderr, "%s ", p64sz(off)); + } + + buf = bufs[nextbuf]; + if (++nextbuf == nbufs) nextbuf = 0; + if (in >= 0) { + if ((Rtmax != -1) || (Rtmin != -1)) { + start(&start_tv); + } + moved = read(in, buf, Bsize); + + if (pad_in) { /* ignore this run, restart clock */ + pad_in = 0; + count++; + start(NULL); + continue; + } + + if 
((Rtmax != -1) || (Rtmin != -1)) { + int mics = stop(&start_tv, &stop_tv); + + mills = mics / 1000; + if ((mills > Rtmax) || (mills < Rtmin)) { + fprintf(stderr, + "READ: %.02f milliseconds offset %s\n", + ((float)mics) / 1000, + p64sz(LSEEK(in, 0, SEEK_CUR))); + } + /* + * Put this read time in the histogram. + * The buckets are each 1/10th of Rtmax. + */ + if (mills >= Rtmax) { + rthist[11]++; + } else if (mills < Rtmin) { + rthist[0]++; + } else { + int step = (Rtmax - Rtmin) / 10; + int i; + + for (i = 1; i <= 10; ++i) { + if (mills < i * step + Rtmin) { + rthist[i]++; + break; + } + } + } + } + } else { + moved = Bsize; + } + if (moved == -1) { + perror("read"); + } + if (moved <= 0) { + done(); + } + if (inpat != -1) { + register int foo, cnt; + + for (foo = 0, cnt = moved/sizeof(int); cnt--; foo++) { + if (buf[foo] != (uint) (off + foo*sizeof(int))) { + fprintf(stderr, + "off=%u want=%x got=%x\n", + (uint)off, + (uint)(off + foo*sizeof(int)), + buf[foo]); + if (mismatch != -1 && --misses == 0) { + done(); + } + } + } + } + if ((in >= 0) && touch) { + int i; + + for (i = 0; i < moved; i += 4096) { + ((char *)buf)[i] = 0; + } + } + if (out >= 0) { + int moved2; + + if (Fork != -1) { + if (pid) { + waitpid(pid, 0, 0); + } + if ((pid = fork())) { + off += moved; + int_count += (moved >> 2); + continue; + } + } + if (outpat != -1) { + register int foo, cnt; + + for (foo = 0, cnt = moved/sizeof(int); + cnt--; foo++) { + buf[foo] = + (uint)(off + foo*sizeof(int)); + } + } + if ((Wtmax != -1) || (Wtmin != -1)) { + start(&start_tv); + } +#ifdef USE_BDS + /* + * The first time through, m1 & m2 are null. + * The Nth time through, we start the I/O into + * m2, and wait on m1, then switch. 
+ */ + if (awrite) { + if (m1) { + m2 = bds_awrite(out, buf, moved); + moved2 = bds_adone(out, m1); + m1 = m2; + } else { + m1 = bds_awrite(out, buf, moved); + goto writedone; + } + } else { + moved2 = write(out, buf, moved); + } +#else + moved2 = write(out, buf, moved); +#endif + + if (moved2 == -1) { + perror("write"); + } + if (moved2 != moved) { + fprintf(stderr, "write: wanted=%d got=%d\n", + moved, moved2); + done(); + } + if ((Wtmax != -1) || (Wtmin != -1)) { + int mics = stop(&start_tv, &stop_tv); + + mills = mics / 1000; + if ((mills > Wtmax) || (mills < Wtmin)) { + fprintf(stderr, + "WRITE: %.02f milliseconds offset %s\n", + ((float)mics) / 1000, + p64sz(LSEEK(out, 0, SEEK_CUR))); + } + /* + * Put this write time in the histogram. + * The buckets are each 1/10th of Wtmax. + */ + if (mills >= Wtmax) { + wthist[11]++; + } else if (mills < Wtmin) { + wthist[0]++; + } else { + int step = (Wtmax - Wtmin) / 10; + int i; + + for (i = 1; i <= 10; ++i) { + if (mills < i * step + Wtmin) { + wthist[i]++; + break; + } + } + } + } + + if (moved2 == -1) { + perror("write"); + } + if (moved2 != moved) { + done(); + } + + if (touch) { + int i; + + for (i = 0; i < moved; i += 4096) { + ((char *)buf)[i] = 0; + } + } + } +#ifdef USE_BDS +writedone: /* for the first async write */ +#endif + off += moved; + int_count += (moved >> 2); +#if !defined(hpux) + if (slp != -1) { + usleep(slp); + } +#endif + if (hash) { + fprintf(stderr, "#"); + } + if (Fork != -1) { + exit(0); + } + } +} + +int +been_there(uint64 off) +{ + register int i; + + for (i = 0; i <= norepeats; ++i) { + if (off == norepeat[i]) { + fprintf(stderr, "norepeat on %u\n", (uint)off); + return (1); + } + } + return (0); +} + +void +chkarg(char *arg) +{ + int i; + char *a, *b; + + for (i = 0; cmds[i]; ++i) { + for (a = arg, b = cmds[i]; *a && *b && *a == *b; a++, b++) + ; + if (*a == '=') + return; + } + fprintf(stderr, "Bad arg: %s, possible arguments are: ", arg); + for (i = 0; cmds[i]; ++i) { + fprintf(stderr, 
"%s ", cmds[i]); + } + fprintf(stderr, "\n"); + exit(1); + /*NOTREACHED*/ +} + +void +done(void) +{ + int i; + int step; + int size; + +#ifdef USE_BDS + if (awrite && m1) { + bds_adone(out, m1); + } +#endif + if (Sync > 0) + sync(); + if (Fsync > 0) + fsync(out); +#ifdef FLUSH + if (Flush > 0) + flush(); +#endif + stop(NULL, NULL); +#ifdef RUSAGE + if (ru != -1) + rusage(); +#endif + if (hash || poff) { + fprintf(stderr, "\n"); + } + if ((long)Label != -1) { + fprintf(stderr, "%s", Label); + } + int_count <<= 2; + switch (Print) { + case 0: /* no print out */ + break; + + case 1: /* latency type print out */ + latency((uint64)(int_count / Bsize), (uint64)Bsize); + break; + + case 2: /* microsecond per op print out */ + micro("", (uint64)(int_count / Bsize)); + break; + + case 3: /* kb / sec print out */ + kb(int_count); + break; + + case 4: /* mb / sec print out */ + mb(int_count); + break; + + case 5: /* Xgraph output */ + bandwidth(int_count, 1, 0); + break; + + default: /* bandwidth print out */ + bandwidth(int_count, 1, 1); + break; + } + if (Rtmax != -1) { + printf("READ operation latencies\n"); + step = (Rtmax - Rtmin) / 10; + if (rthist[0]) { + printf("%d- ms: %d\n", Rtmin, rthist[0]); + } + for (i = 1, size = Rtmin; i <= 10; i++, size += step) { + if (!rthist[i]) + continue; + printf("%d to %d ms: %d\n", + size, size + step - 1, rthist[i]); + } + if (rthist[11]) { + printf("%d+ ms: %d\n", Rtmax, rthist[11]); + } + } + if (Wtmax != -1) { + printf("WRITE operation latencies\n"); + step = (Wtmax - Wtmin) / 10; + if (wthist[0]) { + printf("%d- ms: %d\n", Wtmin, wthist[0]); + } + for (i = 1, size = Wtmin; i <= 10; i++, size += step) { + if (!wthist[i]) + continue; + printf("%d to %d ms: %d\n", + size, size + step - 1, wthist[i]); + } + if (wthist[11]) { + printf("%d+ ms: %d\n", Wtmax, wthist[11]); + } + } + exit(0); +} + +uint64 +getarg(char *s, int ac, char **av) +{ + register uint64 len, i; + + len = strlen(s); + + for (i = 1; i < ac; ++i) { + if 
(!strncmp(av[i], s, len)) { + register uint64 bs = ATOL(&av[i][len]); + + switch (av[i][strlen(av[i]) - 1]) { + case 'K': bs *= 1000; break; + case 'k': bs <<= 10; break; + case 'M': bs *= 1000000; break; + case 'm': bs <<= 20; break; + case 'G': bs *= 1000000000L; break; + case 'g': bs <<= 30; break; + } + + if (!strncmp(av[i], "label", 5)) { + return (uint64)(long)(&av[i][len]); /* HACK */ + } + if (!strncmp(av[i], "bs=", 3)) { + return (uint64)(bs); + } + return (bs); + } + } + return ((uint64)-1); +} + +char *output; + +int +getfile(char *s, int ac, char **av) +{ + register int ret, len, i; + int append = getarg("append=", ac, av) != -1; + int notrunc = getarg("notrunc=", ac, av) != -1; + int nocreate = getarg("nocreate=", ac, av) != -1; +#ifdef O_SYNC + int osync = getarg("osync=", ac, av) != -1; +#endif + int oflags; + + len = strlen(s); + + for (i = 1; i < ac; ++i) { + if (!strncmp(av[i], s, len)) { + if (av[i][0] == 'o') { + if (!strcmp("of=internal", av[i])) + return (-2); + if (!strcmp("of=stdout", av[i])) + return (1); + if (!strcmp("of=1", av[i])) + return (1); + if (!strcmp("of=-", av[i])) + return (1); + if (!strcmp("of=stderr", av[i])) + return (2); + if (!strcmp("of=2", av[i])) + return (2); + oflags = O_WRONLY; + oflags |= (notrunc || append) ? 0 : O_TRUNC; + oflags |= nocreate ? 0 : O_CREAT; + oflags |= append ? O_APPEND : 0; +#ifdef O_SYNC + oflags |= osync ? 
O_SYNC : 0; +#endif + ret = open(&av[i][len], oflags,0644); +#ifdef O_DIRECT + if ((getarg("odirect=", ac, av) != -1) || + (getarg("direct=", ac, av) != -1)) { + close(ret); + ret = open(&av[i][len], oflags|O_DIRECT); + awrite = + getarg("awrite=", ac, av) != -1; + } +#endif + if (ret == -1) + error(&av[i][len]); +#ifdef F_FSSETXATTR + if (Realtime == 1) { + struct fsxattr fsxattr; + + bzero(&fsxattr,sizeof(struct fsxattr)); + fsxattr.fsx_xflags = 0x1; + if (fcntl(ret,F_FSSETXATTR,&fsxattr)){ + printf("WARNING: Could not make %s a real time file\n", + &av[i][len]); + } + } +#endif + output = &av[i][len]; + return (ret); + } else { + if (!strcmp("if=internal", av[i])) + return (-2); + if (!strcmp("if=stdin", av[i])) + return (0); + if (!strcmp("if=0", av[i])) + return (0); + if (!strcmp("if=-", av[i])) + return (0); + ret = open(&av[i][len], 0); +#ifdef O_DIRECT + if ((getarg("idirect=", ac, av) != -1) || + (getarg("direct=", ac, av) != -1)) { + close(ret); + ret = open(&av[i][len], O_RDONLY|O_DIRECT); + } +#endif + if (ret == -1) + error(&av[i][len]); + return (ret); + } + } + } + return (-2); +} + +#ifdef FLUSH +int +warning(char *s) +{ + if ((long)Label != -1) { + fprintf(stderr, "%s: ", Label); + } + perror(s); + return (-1); +} + +void +flush(void) +{ + int fd; + struct stat sb; + caddr_t where; + + if (output == NULL || (fd = open(output, 2)) == -1) { + warning("No output file"); + return; + } + if (fstat(fd, &sb) == -1 || sb.st_size == 0) { + warning(output); + return; + } + where = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + msync(where, sb.st_size, MS_INVALIDATE); + munmap(where, sb.st_size); +} +#endif + +void +error(char *s) +{ + if ((long)Label != -1) { + fprintf(stderr, "%s: ", Label); + } + perror(s); + exit(1); +} diff --git a/performance/lmbench3/src/lmhttp.c b/performance/lmbench3/src/lmhttp.c new file mode 100644 index 0000000..00bd4b0 --- /dev/null +++ b/performance/lmbench3/src/lmhttp.c @@ -0,0 +1,397 @@ +/* + * http_srv.c - 
simple HTTP "server" + * + * Only implements the simplest GET operation. + * + * usage: http_srv [-f#] [-l] [-d] [port] + * + * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Other authors: Steve Alexander, sca@xxxxxxx. + */ +char *id = "$Id$\n"; + +#include "bench.h" +#ifdef MAP_FILE +# define MMAP_FLAGS MAP_FILE|MAP_SHARED +#else +# define MMAP_FLAGS MAP_SHARED +#endif +#define MMAPS_BETTER (4<<10) /* mmap is faster for sizes >= this */ +#define LOGFILE "/usr/tmp/lmhttp.log" + +char *buf; +char *bufs[3]; +int Dflg, dflg, nflg, lflg, fflg, zflg; +int data, logfile; +void die(); +void worker(); +char *http_time(void); +char *date(time_t *tt); +char *type(char *name); +int source(int sock); +int isdir(char *name); +void dodir(char *name, int sock); +void fake(int sock, char *buf, int size); +void rdwr(int fd, int sock, char *buf); +int mmap_rdwr(int from, int to, int size); +void logit(int sock, char *name, int size); + + +int +main(int ac, char **av) +{ + int i, prog; +#ifdef sgi + int ncpus = sysmp(MP_NPROCS); +#endif + + for (i = 1; i < ac; ++i) { + if (av[i][0] != '-') { + break; + } + switch (av[i][1]) { + case 'D': Dflg = 1; break; /* Allow directories */ + case 'd': dflg = 1; break; /* debugging */ + case 'f': fflg = atoi(&av[i][2]); + break; /* # of threads */ + case 'l': lflg = 1; break; /* logging */ + case 'n': nflg = 1; break; /* fake file i/o */ + case 'z': zflg = 1; break; /* all files are 0 size */ + default: + fprintf(stderr, "Barf.\n"); + exit(1); + } + } + if (getenv("DOCROOT")) { + if (chdir(getenv("DOCROOT")) == -1) { + perror(getenv("DOCROOT")); + exit(1); + } + } + if (atoi(av[ac - 1]) != 0) { + prog = -atoi(av[ac - 1]); + } else { + prog = -80; + } + /* + * Steve - why is this here? 
+ */ + signal(SIGPIPE, SIG_IGN); + data = tcp_server(prog, SOCKOPT_REUSE); + bufs[0] = valloc(XFERSIZE); + bufs[1] = valloc(XFERSIZE); + bufs[2] = valloc(XFERSIZE); + logfile = open(LOGFILE, O_CREAT|O_APPEND|O_WRONLY, 0666); + signal(SIGINT, die); + signal(SIGHUP, die); + signal(SIGTERM, die); + for (i = 1; i < fflg; ++i) { + if (fork() <= 0) { + break; + } + } + handle_scheduler(i, 0, 0); + worker(); + return(0); +} + +void +worker() +{ + int newdata; + int next = 0; + + for (;;) { + buf = bufs[next]; + if (++next == 3) next = 0; + newdata = tcp_accept(data, SOCKOPT_REUSE); + source(newdata); + close(newdata); + } +} + +/* + * "Tue, 28 Jan 97 01:20:30 GMT"; + * 012345678901234567890123456 + */ +char *http_time() +{ + time_t tt; + static time_t save_tt; + struct tm *t; + static struct tm save_tm; + static char buf[100]; + + time(&tt); /* costs 10 usecs */ + if (tt == save_tt) { + return (buf); + } + save_tt = tt; + t = gmtime(&tt); /* costs 21 usecs */ + if (buf[0] && (tt - save_tt < 3600)) { + buf[22] = t->tm_sec / 10 + '0'; + buf[21] = t->tm_sec % 10 + '0'; + save_tm.tm_sec = t->tm_sec; + if (save_tm.tm_min == t->tm_min) { + return (buf); + } + } + save_tm = *t; + /* costs 120 usecs */ + strftime(buf, sizeof(buf), "%a, %d %b %y %H:%M:%S %Z", t); + return(buf); +} + +/* + * Input: dates that are probably within the last year. + * Output: Tue, 28 Jan 97 01:20:30 GMT + * + * Since it costs 150 usecs or so to do this on an Indy, it may pay to + * optimize this. + */ +char * +date(time_t *tt) +{ + return "Tue, 28 Jan 97 01:20:30 GMT"; +} + +char * +type(char *name) +{ + int len = strlen(name); + + if (!strcmp(&name[len - 4], ".gif")) { + return "image/gif"; + } + if (!strcmp(&name[len - 5], ".jpeg")) { + return "image/jpeg"; + } + if (!strcmp(&name[len - 5], ".html")) { + return "text/html"; + } + if (Dflg && isdir(name)) { + return "text/html"; + } + return "text/plain"; +} + +/* + * Read the file to be transfered. + * Write that file on the data socket. 
+ * The caller closes the socket. + */ +int +source(int sock) +{ + int fd, n, size; + char *s; + char file[100]; + char hbuf[1024]; + struct stat sb; +#define name &buf[5] + + n = read(sock, buf, XFERSIZE); + if (n <= 0) { + perror("control nbytes"); + return (-1); + } + buf[n] = 0; + if (dflg) printf("%.*s\n", n, buf); + if (zflg) { + return (0); + } + if (!strncmp(buf, "EXIT", 4)) { + exit(0); + } + if (strncmp(buf, "GET /", 5)) { + perror(buf); + return(1); + } + for (s = buf; *s && *s != '\r' && *s != '\n'; s++) + ; + *s = 0; + for (s = name; *s && *s != ' '; s++) + ; + *s = 0; + if (lflg) strncpy(file, name, sizeof(file)); + if (dflg) printf("OPEN %s\n", name); + fd = open(name, 0); + if (fd == -1) { +error: perror(name); + close(fd); + return (1); + } + if (fstat(fd, &sb) == -1) { + if (dflg) printf("Couldn't stat %s\n", name); + goto error; + } + size = sb.st_size; + n = sprintf(hbuf, "HTTP/1.0 200 OK\r\n%s\r\nServer: lmhttp/0.1\r\nContent-Type: %s\r\nLast-Modified: %s\r\n\r\n", + http_time(), type(name), date(&sb.st_mtime)); + if (write(sock, hbuf, n) != n) { + goto error; + } + if (Dflg && isdir(name)) { + dodir(name, sock); + } else if (nflg) { + fake(sock, buf, size); + } else if ((size > MMAPS_BETTER)) { /* XXX */ + if (mmap_rdwr(fd, sock, size) == -1) { + printf("%s mmap failed\n", name); + } + } else { + rdwr(fd, sock, buf); + } + if (lflg) logit(sock, file, size); + close(fd); + return(0); +} +#undef name + + +int +isdir(char *name) +{ + struct stat sb; + if (stat(name, &sb) == -1) { + return(0); + } + return (S_ISDIR(sb.st_mode)); +} + +#ifdef example +<HTML><HEAD> +<TITLE>Index of /pub/Linux</TITLE> +</HEAD><BODY> +<H1>Index of /pub/Linux</H1> +<PRE><IMG SRC="/icons/blank.gif" ALT=" "> Name Last modified Size Description +<HR> +<IMG SRC="/icons/unknown.gif" ALT="[ ]"> <A HREF="!INDEX">!INDEX</A> 19-Sep-97 03:20 3k +<IMG SRC="/icons/text.gif" ALT="[TXT]"> <A HREF="!INDEX.html">!INDEX.html</A> 19-Sep-97 03:20 6k +#endif + +void +dodir(char *name, int 
sock) +{ + FILE *p; + char buf[1024]; + char path[1024]; + + if (dflg) printf("dodir(%s)\n", name); + sprintf(buf, "cd %s && ls -1a", name); + p = popen(buf, "r"); + if (!p && dflg) printf("Couldn't popen %s\n", buf); + sprintf(buf, "\ +<HTML><HEAD>\n<TITLE>Index of /%s</TITLE></HEAD><BODY><H1>Index of /%s</H1>\n", + name, name); + write(sock, buf, strlen(buf)); + while (fgets(buf, sizeof(buf), p)) { + buf[strlen(buf) - 1] = 0; + sprintf(path, "/%s/%s", name, buf); + if (dflg) printf("\t%s\n", path); + write(sock, "<A HREF=\"", 9); + write(sock, path, strlen(path)); + write(sock, "\">", 2); + write(sock, buf, strlen(buf)); + write(sock, "</A><BR>\n", 9); + } + pclose(p); +} + +void +fake(int sock, char *buf, int size) +{ + int n; + + while (size > 0) { + n = write(sock, buf, size > XFERSIZE ? XFERSIZE : size); + if (n == -1) { + perror("write on socket"); + return; + } + size -= n; + } +} + +void +rdwr(int fd, int sock, char *buf) +{ + int nread; + + while ((nread = read(fd, buf, XFERSIZE)) > 0) { + int i; + + for (i = 0; i < nread; ) { + int nwrote = write(sock, buf, nread - i); + + if (i < 0) { + exit(1); + } + i += nwrote; + } + } +} + +int +mmap_rdwr(int from, int to, int size) +{ + char *buf; + int done = 0, wrote; + + buf = mmap(0, size, PROT_READ, MMAP_FLAGS, from, 0); + if ((long)buf == -1) { + perror("mmap"); + return (-1); + } + do { + wrote = write(to, buf + done, size - done); + if (wrote == -1) { + perror("write"); + break; + } + done += wrote; + } while (done < size); + if (munmap(buf, size) == -1) { + perror("unmap"); + } + return (0); +} + +static char logbuf[64<<10]; /* buffer into here */ +static int nbytes; /* bytes buffered */ + +/* + * HTTP server logging, compressed format. 
+ */ +void +logit(int sock, char *name, int size) +{ + struct sockaddr_in sin; + int len = sizeof(sin); + char buf[1024 + 16]; /* maxpathlen + others */ + + if (getpeername(sock, (struct sockaddr*)&sin, &len) == -1) { + perror("getpeername"); + return; + } + len = sprintf(buf, "%u %u %s %u\n", + *((unsigned int*)&sin.sin_addr), (unsigned int)time(0), name, size); + if (nbytes + len >= sizeof(logbuf)) { + write(logfile, logbuf, nbytes); + nbytes = 0; + } + bcopy(buf, &logbuf[nbytes], len); + nbytes += len; +} + +void die() +{ + if (nbytes) { + write(logfile, logbuf, nbytes); + nbytes = 0; + } + exit(1); +} diff --git a/performance/lmbench3/src/loop_o.c b/performance/lmbench3/src/loop_o.c new file mode 100644 index 0000000..1cc4333 --- /dev/null +++ b/performance/lmbench3/src/loop_o.c @@ -0,0 +1,8 @@ +#include "bench.h" + +int +main() +{ + printf("%.8f\n", l_overhead()); + return (0); +} diff --git a/performance/lmbench3/src/memsize.c b/performance/lmbench3/src/memsize.c new file mode 100644 index 0000000..e1d05be --- /dev/null +++ b/performance/lmbench3/src/memsize.c @@ -0,0 +1,192 @@ +/* + * memsize.c - figure out how much memory we have to use. + * + * Usage: memsize [max_wanted_in_MB] + * + * Copyright (c) 1995 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + +#ifndef TOO_LONG +#define TOO_LONG 10 /* usecs */ +#endif + +int alarm_triggered = 0; + +void timeit(char *where, size_t size); +static void touchRange(char *p, size_t range, ssize_t stride); +int test_malloc(size_t size); +void set_alarm(uint64 usecs); +void clear_alarm(); + +int +main(int ac, char **av) +{ + char *where; + char *tmp; + size_t size = 0; + size_t max = 0; + size_t delta; + + if (ac == 2) { + max = size = bytes(av[1]) * 1024 * 1024; + } + if (max < 1024 * 1024) { + max = size = 1024 * 1024 * 1024; + } + /* + * Binary search down and then binary search up + */ + for (where = 0; !test_malloc(size); size >>= 1) { + max = size; + } + /* delta = size / (2 * 1024 * 1024) */ + for (delta = (size >> 21); delta > 0; delta >>= 1) { + uint64 sz = (uint64)size + (uint64)delta * 1024 * 1024; + if (max < sz) continue; + if (test_malloc(sz)) size = sz; + } + if (where = malloc(size)) { + timeit(where, size); + free(where); + } + exit (0); +} + +void +timeit(char *where, size_t size) +{ + int sum = 0; + char *end = where + size; + size_t n; + size_t s; + size_t range; + size_t incr = 1024 * 1024; + ssize_t stride; + size_t pagesize = getpagesize(); + + if (size < 1024*1024 - 16*1024) { + fprintf(stderr, "Bad size\n"); + return; + } + + range = 1024 * 1024; + incr = 1024 * 1024; + touchRange(where, range, pagesize); + for (range += incr; range <= size; range += incr) { + n = range / pagesize; + set_alarm(n * TOO_LONG); + touchRange(where + range - incr, incr, pagesize); + clear_alarm(); + set_alarm(n * TOO_LONG); + start(0); + touchRange(where, range, pagesize); + sum = stop(0, 0); + clear_alarm(); + if ((sum / n) > TOO_LONG || alarm_triggered) { + size = range - incr; + break; + } + for (s = 8 * 1024 * 1024; s <= range; s *= 2) + ; + incr = s / 8; + if (range < size && size < range + incr) { + incr = size - range; + } + fprintf(stderr, "%dMB OK\r", 
range/(1024*1024)); + } + fprintf(stderr, "\n"); + printf("%d\n", (size>>20)); +} + +static void +touchRange(char *p, size_t range, ssize_t stride) +{ + register char *tmp = p + (stride > 0 ? 0 : range - 1); + register size_t delta = (stride > 0 ? stride : -stride); + + while (range > delta - 1 && !alarm_triggered) { + *tmp = 0; + tmp += stride; + range -= delta; + } +} + +int +test_malloc(size_t size) +{ + int fid[2]; + int result; + int status; + void* p; + + if (pipe(fid) < 0) { + void* p = malloc(size); + if (!p) return 0; + free(p); + return 1; + } + if (fork() == 0) { + close(fid[0]); + p = malloc(size); + result = (p ? 1 : 0); + write(fid[1], &result, sizeof(int)); + close(fid[1]); + if (p) free(p); + exit(0); + } + close(fid[1]); + if (read(fid[0], &result, sizeof(int)) != sizeof(int)) + result = 0; + close(fid[0]); + wait(&status); + return result; +} + +void +gotalarm() +{ + alarm_triggered = 1; +} + +void +set_alarm(uint64 usecs) +{ + struct itimerval value; + struct sigaction sa; + + alarm_triggered = 0; + + sa.sa_handler = gotalarm; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = 0; + value.it_value.tv_sec = usecs / 1000000; + value.it_value.tv_usec = usecs % 1000000; + + setitimer(ITIMER_REAL, &value, NULL); +} + +void +clear_alarm() +{ + struct itimerval value; + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = 0; + value.it_value.tv_sec = 0; + value.it_value.tv_usec = 0; + + setitimer(ITIMER_REAL, &value, NULL); +} + diff --git a/performance/lmbench3/src/mhz.c b/performance/lmbench3/src/mhz.c new file mode 100644 index 0000000..210f6fc --- /dev/null +++ b/performance/lmbench3/src/mhz.c @@ -0,0 +1,507 @@ +/* + * mhz.c - calculate clock rate and megahertz + * + * Usage: mhz [-c] + * + ******************************************************************* + * + * Caveat emptor and other warnings + * + * This code must be compiled using the optimizer! 
If you don't + * compile this using the optimizer, then many compilers don't + * make good use of the registers and your inner loops end up + * using stack variables, which is SLOW. + * + * Also, it is sensitive to other processor load. When running + * mhz with "rtprio" (real-time priority), I have never had mhz + * make a mistake on my machine. At other times mhz has been + * wrong about 10% of the time. + * + * If there is too much noise/error in the data, then this program + * will usually return a clock speed that is too high. + * + ******************************************************************* + * + * Constraints + * + * mhz.c is meant to be platform independent ANSI/C code, and it + * has as little platform dependent code as possible. + * + * This version of mhz is designed to eliminate the variable + * instruction counts used by different compilers on different + * architectures and instruction sets. It is also structured to + * be tightly interlocked so processors with super-scalar elements + * or dynamic instructure reorder buffers cannot overlap the + * execution of the expressions. + * + * We have to try and make sure that the code in the various + * inner loops does not fall out of the on-chip instruction cache + * and that the inner loop variables fit inside the register set. + * The i386 only has six addressable registers, so we had to make + * sure that the inner loop procedures had fewer variables so they + * would not spill onto the stack. + * + ******************************************************************* + * + * Algorithm + * + * We can compute the CPU cycle time if we can get the compiler + * to generate (at least) two instruction sequences inside loops + * where the inner loop instruction counts are relatively prime. + * We have several different loops to increase the chance that + * two of them will be relatively prime on any given architecture. 
+ * + * This technique makes no assumptions about the cost of any single + * instruction or the number of instructions used to implement a + * given expression. We just hope that the compiler gets at least + * two inner loop instruction sequences with lengths that are + * relatively prime. The "relatively prime" makes the greatest + * common divisor method work. If all the instructions sequences + * have a common factor (e.g. 2), then the apparent CPU speed will + * be off by that common factor. Also, if there is too much + * variability in the data so there is no apparent least common + * multiple within the error bounds set in multiple_approx, then + * we simply return the maximum clock rate found in the loops. + * + * The processor's clock speed is the greatest common divisor + * of the execution frequencies of the various loops. For + * example, suppose we are trying to compute the clock speed + * for a 120Mhz processor, and we have two loops: + * SHR --- two cycles to shift right + * SHR;ADD --- three cycles to SHR and add + * then the expression duration will be: + * SHR 11.1ns (2 cycles/SHR) + * SHR;ADD 16.6ns (3 cycles/SHR;ADD) + * so the greatest common divisor is 5.55ns and the clock speed + * is 120Mhz. Aside from extraneous variability added by poor + * benchmarking hygiene, this method should always work when we + * are able to get loops with cycle counts that are relatively + * prime. + * + * Suppose we are unlucky, and we have our two loops do + * not have relatively prime instruction counts. Suppose + * our two loops are: + * SHR 11.1ns (2 cycles/SHR) + * SHR;ADD;SUB 22.2ns (4 cycles/SHR;ADD;SUB) + * then the greatest common divisor will be 11.1ns, so the clock + * speed will appear to be 60Mhz. + * + * The loops provided so far should have at least two relatively + * prime loops on nearly all architectures. + * + ******************************************************************* + * + * Copyright (c) 1994 Larry McVoy. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Silicon Graphics is gratefully acknowledged. + * Support for this development by Hewlett Packard is gratefully acknowledged. + * Support for this development by Sun Microsystems is gratefully acknowledged. + * + ******************************************************************* + */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <math.h> + +typedef long TYPE; + +#define TEN(A) A A A A A A A A A A +#define HUNDRED(A) TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) \ + TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) + +#define MHZ(M, contents) \ +char* \ +name_##M() \ +{ \ + return #contents; \ +} \ + \ +TYPE** \ +_mhz_##M (register long n, register TYPE **p, \ + register TYPE a, register TYPE b) \ +{ \ + for (; n > 0; --n) { \ + HUNDRED(contents) \ + } \ + return p + a + b; \ +} \ + \ +void \ +mhz_##M(int enough) \ +{ \ + TYPE __i = 1; \ + TYPE *__x=(TYPE *)&__x, **__p=(TYPE **)__x, **__q = NULL; \ + _mhz_##M(1, __p, 1, 1); \ + BENCH1(__q = _mhz_##M(__n, __p, __i, __i); __n = 1;, enough) \ + use_pointer((void*)__q); \ + save_n(100 * get_n()); /* # of expressions executed */ \ +} + +MHZ(1, p=(TYPE**)*p;) +MHZ(2, a^=a+a;) +MHZ(3, a^=a+a+a;) +MHZ(4, a>>=b;) +MHZ(5, a>>=a+a;) +MHZ(6, a^=a<<b;) +MHZ(7, a^=a+b;) +MHZ(8, a+=(a+b)&07;) +MHZ(9, a^=n;b^=a;a|=b;) + +typedef void (*loop_f)(int); +loop_f loops[] = { + mhz_1, + mhz_2, + mhz_3, + mhz_4, + mhz_5, + mhz_6, + mhz_7, + mhz_8, + mhz_9, +}; + + +#define NTESTS (sizeof(loops) / sizeof(loop_f)) +#define BIT_SET(A,bit) ((A) & 1 << (bit)) + + +/* + * This is used to filter out bad points (mostly ones that have had + * their inner loop optimized away). Bad points are those with values + * less than 1/20th of the median value and more than 20 times the + * median value. 
+ * + * filter_data returns the number of valid data points, and puts the + * valid points in the lower part of the values[] array. + */ +int +filter_data(double values[], int size) +{ + int i; + int tests; + double median; + double *d = (double *)malloc((size + 1) * sizeof(double)); + + for (i = 0; i < size; ++i) d[i] = values[i]; + qsort(d, size, sizeof(double), double_compare); + + median = d[size/2]; + if (size > 0 && size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; + + free(d); + + /* if the data point is inside the envelope of acceptable + * results, then keep it, otherwise discard it + */ + for (i = 0, tests = 0; i < size; ++i) + if (0.05 * median < values[i] && values[i] < 20.0 * median) { + if (i > tests) values[tests] = values[i]; + tests++; + } + + return tests; +} + +/* + * make sure that there are enough points with significantly + * different data values (greater than 5% difference) in the + * data subset. + */ +int +classes(double values[], int size) +{ + int i; + double median; + double *d = (double *)malloc(size * sizeof(double)); + int classid; + + for (i = 0; i < size; ++i) d[i] = values[i]; + qsort(d, size, sizeof(double), double_compare); + + median = d[size/2]; + if (size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; + + /* if the difference is less than 1/20th of the median, then + * we assume that the two points are the same + */ + for (i = 1, classid = 1; i < size; ++i) + if ((d[i] - d[i-1]) > 0.05 * median) classid++; + + free(d); + return classid; +} + +/* + * mode + * + * return the most common value (within 1MHz) + */ +int +mode(double values[], int n) +{ + int i, n_mode, n_curr; + int mode, curr; + + qsort(values, n, sizeof(double), double_compare); + + n_mode = 1; + n_curr = 1; + mode = (int)(values[0] + 0.5); + curr = (int)(values[0] + 0.5); + + for (i = 1; i < n; ++i) { + int v = (int)(values[i] + 0.5); + if (curr != v) { + curr = v; + n_curr = 0; + } + n_curr++; + if (n_curr > n_mode) { + mode = curr; + n_mode = n_curr; 
+ } + } + + return mode; +} + +/* + * cross_values + * + * This routine will create new data points by subtracting pairs + * of data points. + */ +void +cross_values(double values[], int size, double **cvalues, int *csize) +{ + int i, j; + + *cvalues = (double *)malloc(size * size * sizeof(double)); + *csize = 0; + + for (i = 0; i < size; ++i) { + (*cvalues)[(*csize)++] = values[i]; + /* create new points with the differences */ + for (j = i + 1; j < size; ++j) { + (*cvalues)[(*csize)++] = ABS(values[i] - values[j]); + } + } +} + + +/* + * gcd + * + * return the greatest common divisor of the passed values (within a + * margin of error because these are experimental results, not + * theoretical numbers). We do this by guessing how many instructions + * are in each loop, and then trying to fit a straight line through + * the (instruction count, time) points. The regression is of the + * form: + * + * y = a + b * x + * + * The time for an individual instruction is "b", while "a" should + * be 0. The trick is to figure out which guess is the right one! + * + * We assume that the gcd is the first value at which we have + * significantly improved regression fit (as measured by chi2). + * + * We increase the number of experimental points (and generate + * more small points) by adding points for the differences between + * measured values (and compute the standard error appropriately). + * + * We want the regression line to go through the origin, so we + * add an artificial point at (0,0) with a tiny standard error. 
+ */ +double +gcd(double values[], int size) +{ +/* assumption: shortest inner loop has no more than this many instructions */ +#define MAX_COUNT 6 + int i, n, count; + double min, result, min_chi2 = 0.0, a, b, sig_a, sig_b, chi2; + double *y, *x = (double *)malloc(size * size * sizeof(double)); + + /* find the smallest value */ + result = min = double_min(values, size); + + /* create new points by subtracting each pair of values */ + cross_values(values, size, &y, &n); + + /* make sure the regression goes through the origin */ + y[n++] = 0.0; + + for (count = 1; count < MAX_COUNT; ++count) { + /* + * given the minimum loop has "count" instructions, + * guess how many instructions each other loop contains + */ + for (i = 0; i < n; ++i) { + int m = (int)((double)count * y[i] / min + 0.5); + x[i] = (double)m; + } + + /* find the regression of the samples */ + regression(x, y, NULL, n, &a, &b, &sig_a, &sig_b, &chi2); + + if (count == 1 || count * count * chi2 < min_chi2) { + result = b; + min_chi2 = chi2; + } + } + free(x); + free(y); + return result; +} + +/* + * compute the gcd of many possible combinations of experimental values + * and return the mode of the results to reduce the impact + * of a few bad experimental measurements on the computed result. + * + * r - pointer to the array of experimental results + * off - offset of the result we want. TRIES-1 == minimum result. 
+ */ +int +compute_mhz(result_t *r) +{ + int i, j, mhz[2], n, subset, ntests; + double data[NTESTS], results[1<<NTESTS]; + + for (i = 0; i < 2; ++i) { + for (subset = 0, ntests = 0; subset < (1<<NTESTS); ++subset) { + for (j = 0, n = 0; j < NTESTS; ++j) + if (BIT_SET(subset, j) && r[j].N > TRIES/2) + data[n++] = r[j].v[r[j].N-1-i].u / (double)r[j].v[r[j].N-1-i].n; + if (n < 2 + || (n = filter_data(data, n)) < 2 + ||classes(data, n) < 2) + continue; + results[ntests++] = 1.0 / gcd(data, n); + } + mhz[i] = mode(results, ntests); + } + /* if the results agree within 1% or 1MHz, accept them */ + if (ABS(mhz[0] - mhz[1]) / (double)mhz[0] <= 0.01 + || ABS(mhz[0] - mhz[1]) <= 1) + return mhz[0]; + + return -1; +} + +void +save_data(result_t* data, result_t* data_save) +{ + int i; + + for (i = 0; i < NTESTS; ++i) { + data_save[i] = data[i]; + } +} + +void +print_data(double mhz, result_t* data) +{ + int i, j; + char *CPU_name = "CPU"; + char *uname = "uname"; + char *email = "email"; + int speed = -1; + char *names[NTESTS]; + + names[0] = name_1(); + names[1] = name_2(); + names[2] = name_3(); + names[3] = name_4(); + names[4] = name_5(); + names[5] = name_6(); + names[6] = name_7(); + names[7] = name_8(); + names[8] = name_9(); + + printf("/* \"%s\", \"%s\", \"%s\", %d, %.0f, %d, %f, %f */\n", + CPU_name, uname, email, speed, + mhz, get_enough(0), l_overhead(), t_overhead()); + printf("result_t* data[] = { \n"); + for (i = 0; i < NTESTS; ++i) { + printf("\t/* %s */ { %d, {", names[i], data[i].N); + for (j = 0; j < data[i].N; ++j) { + printf("\n\t\t{ /* %f */ %lu, %lu}", data[i].v[j].u / (100. 
* data[i].v[j].n), (unsigned long)data[i].v[j].u, (unsigned long)data[i].v[j].n); + if (j < TRIES - 1) printf(", "); + } + if (i < NTESTS - 1) printf("}},\n"); + else printf("}}\n"); + } + printf("};\n"); +} + +int +main(int ac, char **av) +{ + int c, i, j, k, mhz = -1; + double runtime; + result_t data[NTESTS]; + result_t data_save[NTESTS]; + char *usage = "[-d] [-c]\n"; + + putenv("LOOP_O=0.0"); /* should be at most 1% */ + + runtime = (NTESTS * TRIES * 3 * get_enough(0)) / 1000000.; + if (runtime > 3.) { + fprintf(stderr, "mhz: should take approximately %.0f seconds\n", runtime); + } + + /* make three efforts to get reliable data */ + for (i = 0; i < 3 && mhz < 0; ++i) { + /* initialize the data arrays */ + for (j = 0; j < NTESTS; ++j) + insertinit(&data[j]); + + /* + * collect the data; try to minimize impact of activity bursts + * by putting NTESTS in the inner loop so a burst will affect + * one data point for all expressions first, rather than all + * data points for one expression. + */ + for (j = 0; j < TRIES; ++j) { + for (k = 0; k < NTESTS; ++k) { + (*loops[k])(0); + insertsort(gettime(), get_n(), &data[k]); + } + } + save_data(data, data_save); + mhz = compute_mhz(data); + } + + while (( c = getopt(ac, av, "cd")) != EOF) { + switch(c) { + case 'c': + if (mhz > 0) { + printf("%.4f\n", 1000. / (double)mhz); + mhz = 0; + } + break; + case 'd': + print_data(mhz, data_save); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (mhz < 0) { + printf("-1 System too busy\n"); + exit(1); + } + + if (mhz > 0) { + printf("%d MHz, %.4f nanosec clock\n", + mhz, 1000. 
/ (double)mhz); + } + exit(0); +} diff --git a/performance/lmbench3/src/msleep.c b/performance/lmbench3/src/msleep.c new file mode 100644 index 0000000..e605d50 --- /dev/null +++ b/performance/lmbench3/src/msleep.c @@ -0,0 +1,21 @@ +#include "bench.h" + +int +main(int ac, char **av) +{ +#if defined(sgi) || defined(sun) || defined(linux) + usleep(atoi(av[1]) * 1000); + return (0); +#else + fd_set set; + int fd; + struct timeval tv; + + tv.tv_sec = 0; + tv.tv_usec = atoi(av[1]) * 1000; + FD_ZERO(&set); + FD_SET(0, &set); + select(1, &set, 0, 0, &tv); + return (0); +#endif +} diff --git a/performance/lmbench3/src/names.h b/performance/lmbench3/src/names.h new file mode 100644 index 0000000..ea7775c --- /dev/null +++ b/performance/lmbench3/src/names.h @@ -0,0 +1,102 @@ +char *names[] = { +"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", +"k", "l", "m", "n", "o", "p", "q", "r", "s", "t", +"u", "v", "w", "x", "y", "z", "aa", "ab", "ac", "ad", +"ae", "af", "ag", "ah", "ai", "aj", "ak", "al", "am", "an", +"ao", "ap", "aq", "ar", "as", "at", "au", "av", "aw", "ax", +"ay", "az", "ba", "bb", "bc", "bd", "be", "bf", "bg", "bh", +"bi", "bj", "bk", "bl", "bm", "bn", "bo", "bp", "bq", "br", +"bs", "bt", "bu", "bv", "bw", "bx", "by", "bz", "ca", "cb", +"cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "ck", "cl", +"cm", "cn", "co", "cp", "cq", "cr", "cs", "ct", "cu", "cv", +"cw", "cx", "cy", "cz", "da", "db", "dc", "dd", "de", "df", +"dg", "dh", "di", "dj", "dk", "dl", "dm", "dn", "do", "dp", +"dq", "dr", "ds", "dt", "du", "dv", "dw", "dx", "dy", "dz", +"ea", "eb", "ec", "ed", "ee", "ef", "eg", "eh", "ei", "ej", +"ek", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", +"eu", "ev", "ew", "ex", "ey", "ez", "fa", "fb", "fc", "fd", +"fe", "ff", "fg", "fh", "fi", "fj", "fk", "fl", "fm", "fn", +"fo", "fp", "fq", "fr", "fs", "ft", "fu", "fv", "fw", "fx", +"fy", "fz", "ga", "gb", "gc", "gd", "ge", "gf", "gg", "gh", +"gi", "gj", "gk", "gl", "gm", "gn", "go", "gp", "gq", "gr", +"gs", 
"gt", "gu", "gv", "gw", "gx", "gy", "gz", "ha", "hb", +"hc", "hd", "he", "hf", "hg", "hh", "hi", "hj", "hk", "hl", +"hm", "hn", "ho", "hp", "hq", "hr", "hs", "ht", "hu", "hv", +"hw", "hx", "hy", "hz", "ia", "ib", "ic", "id", "ie", "if", +"ig", "ih", "ii", "ij", "ik", "il", "im", "in", "io", "ip", +"iq", "ir", "is", "it", "iu", "iv", "iw", "ix", "iy", "iz", +"ja", "jb", "jc", "jd", "je", "jf", "jg", "jh", "ji", "jj", +"jk", "jl", "jm", "jn", "jo", "jp", "jq", "jr", "js", "jt", +"ju", "jv", "jw", "jx", "jy", "jz", "ka", "kb", "kc", "kd", +"ke", "kf", "kg", "kh", "ki", "kj", "kk", "kl", "km", "kn", +"ko", "kp", "kq", "kr", "ks", "kt", "ku", "kv", "kw", "kx", +"ky", "kz", "la", "lb", "lc", "ld", "le", "lf", "lg", "lh", +"li", "lj", "lk", "ll", "lm", "ln", "lo", "lp", "lq", "lr", +"ls", "lt", "lu", "lv", "lw", "lx", "ly", "lz", "ma", "mb", +"mc", "md", "me", "mf", "mg", "mh", "mi", "mj", "mk", "ml", +"mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", +"mw", "mx", "my", "mz", "na", "nb", "nc", "nd", "ne", "nf", +"ng", "nh", "ni", "nj", "nk", "nl", "nm", "nn", "no", "np", +"nq", "nr", "ns", "nt", "nu", "nv", "nw", "nx", "ny", "nz", +"oa", "ob", "oc", "od", "oe", "of", "og", "oh", "oi", "oj", +"ok", "ol", "om", "on", "oo", "op", "oq", "or", "os", "ot", +"ou", "ov", "ow", "ox", "oy", "oz", "pa", "pb", "pc", "pd", +"pe", "pf", "pg", "ph", "pi", "pj", "pk", "pl", "pm", "pn", +"po", "pp", "pq", "pr", "ps", "pt", "pu", "pv", "pw", "px", +"py", "pz", "qa", "qb", "qc", "qd", "qe", "qf", "qg", "qh", +"qi", "qj", "qk", "ql", "qm", "qn", "qo", "qp", "qq", "qr", +"qs", "qt", "qu", "qv", "qw", "qx", "qy", "qz", "ra", "rb", +"rc", "rd", "re", "rf", "rg", "rh", "ri", "rj", "rk", "rl", +"rm", "rn", "ro", "rp", "rq", "rr", "rs", "rt", "ru", "rv", +"rw", "rx", "ry", "rz", "sa", "sb", "sc", "sd", "se", "sf", +"sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sp", +"sq", "sr", "ss", "st", "su", "sv", "sw", "sx", "sy", "sz", +"ta", "tb", "tc", "td", "te", "tf", "tg", "th", "ti", 
"tj", +"tk", "tl", "tm", "tn", "to", "tp", "tq", "tr", "ts", "tt", +"tu", "tv", "tw", "tx", "ty", "tz", "ua", "ub", "uc", "ud", +"ue", "uf", "ug", "uh", "ui", "uj", "uk", "ul", "um", "un", +"uo", "up", "uq", "ur", "us", "ut", "uu", "uv", "uw", "ux", +"uy", "uz", "va", "vb", "vc", "vd", "ve", "vf", "vg", "vh", +"vi", "vj", "vk", "vl", "vm", "vn", "vo", "vp", "vq", "vr", +"vs", "vt", "vu", "vv", "vw", "vx", "vy", "vz", "wa", "wb", +"wc", "wd", "we", "wf", "wg", "wh", "wi", "wj", "wk", "wl", +"wm", "wn", "wo", "wp", "wq", "wr", "ws", "wt", "wu", "wv", +"ww", "wx", "wy", "wz", "xa", "xb", "xc", "xd", "xe", "xf", +"xg", "xh", "xi", "xj", "xk", "xl", "xm", "xn", "xo", "xp", +"xq", "xr", "xs", "xt", "xu", "xv", "xw", "xx", "xy", "xz", +"ya", "yb", "yc", "yd", "ye", "yf", "yg", "yh", "yi", "yj", +"yk", "yl", "ym", "yn", "yo", "yp", "yq", "yr", "ys", "yt", +"yu", "yv", "yw", "yx", "yy", "yz", "za", "zb", "zc", "zd", +"ze", "zf", "zg", "zh", "zi", "zj", "zk", "zl", "zm", "zn", +"zo", "zp", "zq", "zr", "zs", "zt", "zu", "zv", "zw", "zx", +"zy", "zz", "aaa", "aab", "aac", "aad", "aae", "aaf", "aag", "aah", +"aai", "aaj", "aak", "aal", "aam", "aan", "aao", "aap", "aaq", "aar", +"aas", "aat", "aau", "aav", "aaw", "aax", "aay", "aaz", "aba", "abb", +"abc", "abd", "abe", "abf", "abg", "abh", "abi", "abj", "abk", "abl", +"abm", "abn", "abo", "abp", "abq", "abr", "abs", "abt", "abu", "abv", +"abw", "abx", "aby", "abz", "aca", "acb", "acc", "acd", "ace", "acf", +"acg", "ach", "aci", "acj", "ack", "acl", "acm", "acn", "aco", "acp", +"acq", "acr", "acs", "act", "acu", "acv", "acw", "acx", "acy", "acz", +"ada", "adb", "adc", "add", "ade", "adf", "adg", "adh", "adi", "adj", +"adk", "adl", "adm", "adn", "ado", "adp", "adq", "adr", "ads", "adt", +"adu", "adv", "adw", "adx", "ady", "adz", "aea", "aeb", "aec", "aed", +"aee", "aef", "aeg", "aeh", "aei", "aej", "aek", "ael", "aem", "aen", +"aeo", "aep", "aeq", "aer", "aes", "aet", "aeu", "aev", "aew", "aex", +"aey", "aez", "afa", "afb", "afc", 
"afd", "afe", "aff", "afg", "afh", +"afi", "afj", "afk", "afl", "afm", "afn", "afo", "afp", "afq", "afr", +"afs", "aft", "afu", "afv", "afw", "afx", "afy", "afz", "aga", "agb", +"agc", "agd", "age", "agf", "agg", "agh", "agi", "agj", "agk", "agl", +"agm", "agn", "ago", "agp", "agq", "agr", "ags", "agt", "agu", "agv", +"agw", "agx", "agy", "agz", "aha", "ahb", "ahc", "ahd", "ahe", "ahf", +"ahg", "ahh", "ahi", "ahj", "ahk", "ahl", "ahm", "ahn", "aho", "ahp", +"ahq", "ahr", "ahs", "aht", "ahu", "ahv", "ahw", "ahx", "ahy", "ahz", +"aia", "aib", "aic", "aid", "aie", "aif", "aig", "aih", "aii", "aij", +"aik", "ail", "aim", "ain", "aio", "aip", "aiq", "air", "ais", "ait", +"aiu", "aiv", "aiw", "aix", "aiy", "aiz", "aja", "ajb", "ajc", "ajd", +"aje", "ajf", "ajg", "ajh", "aji", "ajj", "ajk", "ajl", "ajm", "ajn", +"ajo", "ajp", "ajq", "ajr", "ajs", "ajt", "aju", "ajv", "ajw", "ajx", +"ajy", "ajz", "aka", "akb", "akc", "akd", "ake", "akf", "akg", "akh", +"aki", "akj", "akk", "akl", "akm", "akn", "ako", "akp", "akq", "akr", +"aks", "akt", "aku", "akv", "akw", "akx", "aky", "akz", "ala", "alb", +"alc", "ald", "ale", "alf", "alg", "alh", "ali", "alj", "alk", "all", +}; diff --git a/performance/lmbench3/src/par_mem.c b/performance/lmbench3/src/par_mem.c new file mode 100644 index 0000000..2bb78e6 --- /dev/null +++ b/performance/lmbench3/src/par_mem.c @@ -0,0 +1,81 @@ +/* + * par_mem.c - determine the memory hierarchy parallelism + * + * usage: par_mem [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void compute_times(struct mem_state* state, double* tlb_time, double* cache_time); + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i; + int c; + int warmup = 0; + int repetitions = TRIES; + int print_cost = 0; + size_t len; + size_t maxlen = 64 * 1024 * 1024; + double par; + struct mem_state state; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + + state.line = getpagesize() / 16; + state.pagesize = getpagesize(); + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + state.line = atoi(optarg); + if (state.line < sizeof(char*)) + state.line = sizeof(char*); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + for (i = MAX_MEM_PARALLELISM * state.line; i <= maxlen; i<<=1) { + par = par_mem(i, warmup, repetitions, &state); + + if (par > 0.) { + fprintf(stderr, "%.6f %.2f\n", + i / (1000. * 1000.), par); + } + } + + exit(0); +} + + diff --git a/performance/lmbench3/src/par_ops.c b/performance/lmbench3/src/par_ops.c new file mode 100644 index 0000000..1b79615 --- /dev/null +++ b/performance/lmbench3/src/par_ops.c @@ -0,0 +1,501 @@ +/* + * par_ops.c - benchmark of simple operation parallelism + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + int N; + int M; + int K; + int* int_data; + double* double_data; +}; + +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +#define FIVE(m) m m m m m +#define TEN(m) FIVE(m) FIVE(m) +#define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) +#define HUNDRED(m) FIFTY(m) FIFTY(m) + +#define MAX_LOAD_PARALLELISM 16 + +double +max_parallelism(benchmp_f* benchmarks, + int warmup, int repetitions, void* cookie) +{ + int i, j, k; + double baseline, max_load_parallelism, load_parallelism; + result_t *results, *r_save; + + max_load_parallelism = 1.; + + for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { + benchmp(initialize, benchmarks[i], cleanup, + 0, 1, warmup, repetitions, cookie); + save_minimum(); + + if (gettime() == 0) + return -1.; + + if (i == 0) { + baseline = (double)gettime() / (double)get_n(); + } else { + load_parallelism = baseline; + load_parallelism /= (double)gettime(); + load_parallelism *= (double)((i + 1) * get_n()); + if (load_parallelism > max_load_parallelism) { + max_load_parallelism = load_parallelism; + } + } + } + return max_load_parallelism; +} + +#define REPEAT_0(m) m(0) +#define REPEAT_1(m) REPEAT_0(m) m(1) +#define REPEAT_2(m) REPEAT_1(m) m(2) +#define REPEAT_3(m) REPEAT_2(m) m(3) +#define REPEAT_4(m) REPEAT_3(m) m(4) +#define REPEAT_5(m) REPEAT_4(m) m(5) +#define REPEAT_6(m) REPEAT_5(m) m(6) +#define REPEAT_7(m) REPEAT_6(m) m(7) +#define REPEAT_8(m) REPEAT_7(m) m(8) +#define REPEAT_9(m) REPEAT_8(m) m(9) +#define REPEAT_10(m) REPEAT_9(m) m(10) +#define REPEAT_11(m) REPEAT_10(m) m(11) +#define REPEAT_12(m) REPEAT_11(m) m(12) +#define REPEAT_13(m) REPEAT_12(m) m(13) +#define REPEAT_14(m) REPEAT_13(m) m(14) +#define REPEAT_15(m) REPEAT_14(m) m(15) + +#define BENCHMARK(benchmark,N,repeat) \ +void benchmark##_##N(iter_t iterations, void *cookie) \ +{ \ + register iter_t i = iterations; \ + struct _state* state = (struct _state*)cookie; \ + 
repeat(DECLARE); \ + \ + repeat(INIT); \ + while (i-- > 0) { \ + repeat(PREAMBLE); \ + TEN(repeat(BODY)); \ + } \ + \ + repeat(SAVE); \ +} + +#define PARALLEL_BENCHMARKS(benchmark) \ + BENCHMARK(benchmark, 0, REPEAT_0) \ + BENCHMARK(benchmark, 1, REPEAT_1) \ + BENCHMARK(benchmark, 2, REPEAT_2) \ + BENCHMARK(benchmark, 3, REPEAT_3) \ + BENCHMARK(benchmark, 4, REPEAT_4) \ + BENCHMARK(benchmark, 5, REPEAT_5) \ + BENCHMARK(benchmark, 6, REPEAT_6) \ + BENCHMARK(benchmark, 7, REPEAT_7) \ + BENCHMARK(benchmark, 8, REPEAT_8) \ + BENCHMARK(benchmark, 9, REPEAT_9) \ + BENCHMARK(benchmark, 10, REPEAT_10) \ + BENCHMARK(benchmark, 11, REPEAT_11) \ + BENCHMARK(benchmark, 12, REPEAT_12) \ + BENCHMARK(benchmark, 13, REPEAT_13) \ + BENCHMARK(benchmark, 14, REPEAT_14) \ + BENCHMARK(benchmark, 15, REPEAT_15) \ + \ + benchmp_f benchmark##_benchmarks[] = { \ + benchmark##_0, \ + benchmark##_1, \ + benchmark##_2, \ + benchmark##_3, \ + benchmark##_4, \ + benchmark##_5, \ + benchmark##_6, \ + benchmark##_7, \ + benchmark##_8, \ + benchmark##_9, \ + benchmark##_10, \ + benchmark##_11, \ + benchmark##_12, \ + benchmark##_13, \ + benchmark##_14, \ + benchmark##_15 \ + }; + +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N ^= s##N; s##N ^= r##N; r##N |= s##N; +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] + 1; s##N = (N+1) + r##N; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_bit) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) a##N += b##N; b##N -= a##N; +#define DECLARE(N) register int a##N, b##N; +#define INIT(N) a##N = state->int_data[N] + 57; \ + a##N = state->int_data[N] + 31; +#define PREAMBLE(N) +#define SAVE(N) use_int(a##N + b##N); +PARALLEL_BENCHMARKS(integer_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= s##N; +#define DECLARE(N) register int r##N, s##N, 
t##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 37431; \ + s##N = state->int_data[N] - N + 1 + 4; \ + t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ + s##N * s##N * s##N * s##N * s##N - r##N; \ + r##N += t##N; +#define PREAMBLE(N) r##N -= t##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N / r##N); +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 36; \ + s##N = (r##N + 1) << 20; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N %= s##N; r##N |= s##N; +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + iterations; \ + s##N = state->int_data[N] - N + 1 + 62; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_mod) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N ^= i##N; s##N ^= r##N; r##N |= s##N; +#define DECLARE(N) register int64 r##N, s##N, i##N; +#define INIT(N) r##N = state->int_data[N] - N + 1; \ + r##N |= r##N << 32; \ + s##N = iterations + state->int_data[N] - N + 1; \ + s##N |= s##N << 32; \ + i##N = (s##N << 2) - (int64)1; +#define PREAMBLE(N) i##N -= 1; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_bit) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) a##N += b##N; b##N -= a##N; +#define DECLARE(N) register int64 a##N, b##N; +#define INIT(N) a##N = state->int_data[N] - N + 1 + 37420; \ + a##N += (int64)(0xFE + state->int_data[N] - N + 1)<<30; \ + b##N = state->int_data[N] - N + 1 + 21698324; \ + b##N += (int64)(0xFFFE + state->int_data[N] - N + 1)<<29; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)a##N + (int)b##N); 
+PARALLEL_BENCHMARKS(int64_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (r##N * s##N); +#define DECLARE(N) register int64 r##N, s##N, t##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 37420; \ + r##N += (int64)(state->int_data[N] - N + 1 + 6)<<32; \ + s##N = state->int_data[N] - N + 1 + 4; \ + t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ + s##N * s##N * s##N * s##N * s##N - r##N; \ + r##N += t##N; +#define PREAMBLE(N) r##N -= t##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N / r##N); +#define DECLARE(N) register int64 r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 37; \ + r##N += r##N << 33; \ + s##N = (r##N + 17) << 13; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N % r##N) ^ r##N; +#define DECLARE(N) register int64 r##N, s##N; +#define INIT(N) r##N = (int64)state->int_data[N]; s##N = 0; +#define PREAMBLE(N) s##N++; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_mod) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N += r##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = (float)state->double_data[N] + 1023.0; \ + s##N = (float)state->K; +#define PREAMBLE(N) r##N += s##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(float_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= r##N; r##N *= s##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = 8.0f * (float)state->double_data[N]; \ + s##N = 0.125 * (float)state->M * state->double_data[N] / 1000.0; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); 
+PARALLEL_BENCHMARKS(float_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = s##N / r##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = 1.41421356f * (float)state->double_data[N]; \ + s##N = 3.14159265f * (float)(state->int_data[N] - N + 1); +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(float_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N += r##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = state->double_data[N] + 1023.; \ + s##N = (double)state->K; +#define PREAMBLE(N) r##N += s##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(double_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= r##N; r##N *= s##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = 8.0f * state->double_data[N]; \ + s##N = 0.125 * (double)state->M * state->double_data[N] / 1000.0; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(double_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = s##N / r##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = 1.41421356 * state->double_data[N]; \ + s##N = 3.14159265 * (double)(state->int_data[N] - N + 1); +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(double_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state *state = (struct _state*)cookie; + register int i; + + if (iterations) return; + + state->int_data = (int*)malloc(MAX_LOAD_PARALLELISM * sizeof(int)); + state->double_data = (double*)malloc(MAX_LOAD_PARALLELISM * sizeof(double)); + + for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { + 
state->int_data[i] = i+1; + state->double_data[i] = 1.; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state *state = (struct _state*)cookie; + + if (iterations) return; + + free(state->int_data); + free(state->double_data); +} + + +int +main(int ac, char **av) +{ + int c; + int warmup = 0; + int repetitions = TRIES; + double par; + struct _state state; + char *usage = "[-W <warmup>] [-N <repetitions>]\n"; + + state.N = 1; + state.M = 1000; + state.K = -1023; + + while (( c = getopt(ac, av, "W:N:")) != EOF) { + switch(c) { + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + par = max_parallelism(integer_bit_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer bit parallelism: %.2f\n", par); + + par = max_parallelism(integer_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer add parallelism: %.2f\n", par); + + par = max_parallelism(integer_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer mul parallelism: %.2f\n", par); + + par = max_parallelism(integer_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer div parallelism: %.2f\n", par); + + par = max_parallelism(integer_mod_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer mod parallelism: %.2f\n", par); + + par = max_parallelism(int64_bit_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 bit parallelism: %.2f\n", par); + + par = max_parallelism(int64_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 add parallelism: %.2f\n", par); + + par = max_parallelism(int64_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) 
+ fprintf(stderr, "int64 mul parallelism: %.2f\n", par); + + par = max_parallelism(int64_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 div parallelism: %.2f\n", par); + + par = max_parallelism(int64_mod_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 mod parallelism: %.2f\n", par); + + par = max_parallelism(float_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float add parallelism: %.2f\n", par); + + par = max_parallelism(float_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float mul parallelism: %.2f\n", par); + + par = max_parallelism(float_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float div parallelism: %.2f\n", par); + + par = max_parallelism(double_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double add parallelism: %.2f\n", par); + + par = max_parallelism(double_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double mul parallelism: %.2f\n", par); + + par = max_parallelism(double_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double div parallelism: %.2f\n", par); + + + return(0); +} + diff --git a/performance/lmbench3/src/rhttp.c b/performance/lmbench3/src/rhttp.c new file mode 100644 index 0000000..0213050 --- /dev/null +++ b/performance/lmbench3/src/rhttp.c @@ -0,0 +1,125 @@ +/* + * rhttp.c - simple HTTP transaction latency test + * + * usage: rhttp hostname [port] remote-clients -p file file + * + * This turns into a bunch of + * rsh remote http hostname file file file [port] + * with the results aggragated and reported. + * + * The program "http" must be in your path on the remote machine. 
+ * + * XXX - the way this should work is like so: + * parent process reading file names from stdin + * multiple child processes connected to the parent process + * while more file names + * wait for a child process to be idle + * feed it ~10 filenames + * the child processes need to be able to tell the parent that they + * want more work. They also need to pass back the results. + * + * Copyright (c) 1994-1997 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Silicon Graphics is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +int +main(int ac, char **av) +{ + char *name = av[0], *server, *prog; + int i, j; + uint64 total = 0; + uint64 usecs = 0; + char *args[1024]; + + if (ac < 5) { +usage: fprintf(stderr, + "Usage: %s hostname [port] remote-clients -p file ...\n", + name); + exit(1); + } + server = av[1]; + av++, ac--; /* eat server */ + if (atoi(av[1]) != 0) { + prog = av[1]; + av++, ac--; /* eat port */ + } else { + prog = "80"; /* http */ + } + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + i++; + break; + } + } + args[0] = "rsh"; + args[2] = "http"; + args[3] = server; + j = 4; + while (i < ac) { + args[j++] = av[i++]; + } + args[j++] = prog; + args[j] = 0; + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + break; + } + args[1] = av[i]; + for (j = 0; args[j]; j++) { + printf("%s ", args[j]); + } + printf("\n"); + if (fork() == 0) { + char name[30]; + + sprintf(name, "/tmp/rhttp%d", i); + creat(name, 0666); + close(2); + dup(1); + execvp(args[0], args); + perror(args[0]); + exit(1); + } + } + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + break; + } + wait(0); + } + system("cat /tmp/rhttp*; rm /tmp/rhttp*"); + exit(1); + for (i = 1; i < ac; ++i) { + int fd, n, m = 0; + float f1 = 0, f2 = 0; + char 
buf[30]; + + if (!strcmp("-p", av[i])) { + break; + } + sprintf(buf, "/tmp/http%d", i); + fd = open(buf, 0); + unlink(buf); + /* + * Avg xfer: 3.9KB, 235.0KB in 2038 millisecs, 115.31 KB/sec + */ + n = read(fd, buf, XFERSIZE); + buf[n] = 0; + sscanf(buf, "Avg xfer: %fKB, %fKB in %d millisecs,", + &f1, &f2, &m); + if (m > usecs) { + usecs = m; + } + total += f2; + } + total <<= 10; + usecs *= 1000; + settime(usecs); + latency((uint64)1, total); +} diff --git a/performance/lmbench3/src/seek.c b/performance/lmbench3/src/seek.c new file mode 100644 index 0000000..b78b2a8 --- /dev/null +++ b/performance/lmbench3/src/seek.c @@ -0,0 +1,65 @@ +char *id = "$Id$\n"; +/* + * Seek - calculate seeks as a function of distance. + * + * Usage: seek file size + * + * Copyright (c) 1994,1995,1996 Larry McVoy. All rights reserved. + */ + +#include "bench.h" + +#define STRIDE 1024*1024 + +main(ac, av) + int ac; + char *av[]; +{ + char buf[512]; + int disk; + off_t size; + off_t begin, end; + int usecs; + + if (ac != 3) { + exit(1); + } + if ((disk = open(av[1], 0)) == -1) { + exit(1); + } + size = atol(av[2]); + switch (av[2][strlen(av[2])-1]) { + case 'k': size <<= 10; break; + case 'K': size *= 1000; break; + case 'm': size <<= 20; break; + case 'M': size *= 1000000; break; + case 'g': size <<= 30; break; + case 'G': size *= 1000000000L; break; + } + + /* + * We flip back and forth, in strides of 1MB. + * If we have a 100MB disk, that means we do + * 1, 99, 2, 98, etc. 
+ */ + end = size; + begin = 0; + lseek(disk, begin, 0); + read(disk, buf, sizeof(buf)); + while (end > begin) { + end -= STRIDE; + start(); + lseek(disk, end, 0); + read(disk, buf, sizeof(buf)); + usecs = stop(); + printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); + + begin += STRIDE; + start(); + lseek(disk, begin, 0); + read(disk, buf, sizeof(buf)); + usecs = stop(); + printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); + } + exit(0); +} diff --git a/performance/lmbench3/src/stats.h b/performance/lmbench3/src/stats.h new file mode 100644 index 0000000..c355168 --- /dev/null +++ b/performance/lmbench3/src/stats.h @@ -0,0 +1,61 @@ +#ifndef _STATS_H +#define _STATS_H + +#include "bench.h" +#include "timing.h" + +#define ABS(x) ((x) < 0 ? -(x) : (x)) + +int int_compare(const void *a, const void *b); +int uint64_compare(const void *a, const void *b); +int double_compare(const void *a, const void *b); + +typedef int (*int_stat)(int *values, int size); +typedef uint64 (*uint64_stat)(uint64 *values, int size); +typedef double (*double_stat)(double *values, int size); + +int int_median(int *values, int size); +uint64 uint64_median(uint64 *values, int size); +double double_median(double *values, int size); + +int int_mean(int *values, int size); +uint64 uint64_mean(uint64 *values, int size); +double double_mean(double *values, int size); + +int int_min(int *values, int size); +uint64 uint64_min(uint64 *values, int size); +double double_min(double *values, int size); + +int int_max(int *values, int size); +uint64 uint64_max(uint64 *values, int size); +double double_max(double *values, int size); + +double int_variance(int *values, int size); +double uint64_variance(uint64 *values, int size); +double double_variance(double *values, int size); + +double int_moment(int moment, int *values, int size); +double uint64_moment(int moment, uint64 *values, int size); +double double_moment(int moment, double *values, int size); + +double int_stderr(int 
*values, int size); +double uint64_stderr(uint64 *values, int size); +double double_stderr(double *values, int size); + +double int_skew(int *values, int size); +double uint64_skew(uint64 *values, int size); +double double_skew(double *values, int size); + +double int_kurtosis(int *values, int size); +double uint64_kurtosis(uint64 *values, int size); +double double_kurtosis(double *values, int size); + +double int_bootstrap_stderr(int *values, int size, int_stat f); +double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f); +double double_bootstrap_stderr(double *values, int size, double_stat f); + +void regression(double *x, double *y, double *sig, int n, + double *a, double *b, double *sig_a, double *sig_b, + double *chi2); + +#endif /* _STATS_H */ diff --git a/performance/lmbench3/src/stream.c b/performance/lmbench3/src/stream.c new file mode 100644 index 0000000..1202f32 --- /dev/null +++ b/performance/lmbench3/src/stream.c @@ -0,0 +1,309 @@ +/* + * steam.c - lmbench version of John McCalpin's STREAM benchmark + * + * usage: stream + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + double* a; + double* b; + double* c; + double scalar; + int len; +}; + +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +/* These are from STREAM version 1 */ +void copy(iter_t iterations, void* cookie); +void scale(iter_t iterations, void* cookie); +void add(iter_t iterations, void* cookie); +void triad(iter_t iterations, void* cookie); + +/* These are from STREAM version 2 */ +void fill(iter_t iterations, void* cookie); +/* NOTE: copy is the same as in version 1 */ +void daxpy(iter_t iterations, void* cookie); +void sum(iter_t iterations, void* cookie); + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/4 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, j, l; + int version = 1; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + uint64 datasize; + struct _state state; + char *p; + char *usage = "[-v <stream version 1|2>] [-M <len>[K|M]] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.len = 1000 * 1000 * 3 * sizeof(double); + state.scalar = 3.0; + + while (( c = getopt(ac, av, "v:M:P:W:N:")) != EOF) { + switch(c) { + case 'v': + version = atoi(optarg); + if (version != 1 && version != 2) + lmbench_usage(ac, av, usage); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'M': + state.len = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* ensure that we can malloc the desired space */ + while (!(p = malloc(state.len))) + state.len /= 2; + free(p); + + /* convert from bytes to array length */ + state.len /= 3 * sizeof(double); + datasize = sizeof(double) * state.len * parallel; 
+ + if (version == 1) { + benchmp(initialize, copy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM copy latency", state.len * get_n()); + fprintf(stderr, "STREAM copy bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, scale, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM scale latency", state.len * get_n()); + fprintf(stderr, "STREAM scale bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, sum, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM add latency", state.len * get_n()); + fprintf(stderr, "STREAM add bandwidth: "); + mb(3 * datasize * get_n()); + } + + benchmp(initialize, triad, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM triad latency", state.len * get_n()); + fprintf(stderr, "STREAM triad bandwidth: "); + mb(3 * datasize * get_n()); + } + } else { + benchmp(initialize, fill, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 fill latency", state.len * get_n()); + fprintf(stderr, "STREAM2 fill bandwidth: "); + mb(datasize * get_n()); + } + + benchmp(initialize, copy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 copy latency", state.len * get_n()); + fprintf(stderr, "STREAM2 copy bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, daxpy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 daxpy latency", state.len * get_n()); + fprintf(stderr, "STREAM2 daxpy bandwidth: "); + mb(3 * datasize * get_n()); + } + + benchmp(initialize, sum, 
cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 sum latency", state.len * get_n()); + fprintf(stderr, "STREAM2 sum bandwidth: "); + mb(datasize * get_n()); + } + } + + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + int i; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->a = (double*)malloc(sizeof(double) * state->len); + state->b = (double*)malloc(sizeof(double) * state->len); + state->c = (double*)malloc(sizeof(double) * state->len); + + if (state->a == NULL || state->b == NULL || state->c == NULL) { + exit(1); + } + + for (i = 0; i < state->len; ++i) { + state->a[i] = 1.; + state->b[i] = 2.; + state->c[i] = 0.; + } +} + +#define BODY(expr) \ +{ \ + register int i; \ + register int N = state->len; \ + register double* a = state->a; \ + register double* b = state->b; \ + register double* c = state->c; \ + register double scalar = state->scalar; \ + \ + state->a = state->b; \ + state->b = state->c; \ + state->c = a; \ + \ + for (i = 0; i < N; ++i) { \ + expr; \ + } \ +} + +void +copy(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(c[i] = a[i];) + } +} + +void +scale(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(b[i] = scalar * c[i];) + } +} + +void +add(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(c[i] = a[i] + b[i];) + } +} + +void +triad(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(a[i] = b[i] + scalar * c[i];) + } +} + +/* + * STREAM version 2 benchmark kernels + * + * NOTE: copy is the same as version 1's benchmark + */ +void +fill(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + 
+ while (iterations-- > 0) { + BODY(a[i] = 0;) + } +} + +void +daxpy(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(a[i] = a[i] + scalar * b[i];) + } +} + +void +sum(iter_t iterations, void *cookie) +{ + register double s; + struct _state* state = (struct _state*)cookie; + + s = 0.0; + while (iterations-- > 0) { + BODY(s += a[i];) + } + use_int((int)s); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + free(state->a); + free(state->b); + free(state->c); +} + + + diff --git a/performance/lmbench3/src/timing.h b/performance/lmbench3/src/timing.h new file mode 100644 index 0000000..8757743 --- /dev/null +++ b/performance/lmbench3/src/timing.h @@ -0,0 +1,52 @@ +/* + * $Id$ + */ +#ifndef _TIMING_H +#define _TIMING_H + +char *p64(uint64 big); +char *p64sz(uint64 big); +double Delta(void); +double Now(void); +void adjust(int usec); +void bandwidth(uint64 bytes, uint64 times, int verbose); +uint64 bytes(char *s); +void context(uint64 xfers); +uint64 delta(void); +int get_enough(int); +uint64 get_n(void); +void kb(uint64 bytes); +double l_overhead(void); +char last(char *s); +void latency(uint64 xfers, uint64 size); +void mb(uint64 bytes); +void micro(char *s, uint64 n); +void micromb(uint64 mb, uint64 n); +void milli(char *s, uint64 n); +void morefds(void); +void nano(char *s, uint64 n); +uint64 now(void); +void ptime(uint64 n); +void rusage(void); +void save_n(uint64); +void settime(uint64 usecs); +void start(struct timeval *tv); +uint64 stop(struct timeval *begin, struct timeval *end); +uint64 t_overhead(void); +double timespent(void); +void timing(FILE *out); +uint64 tvdelta(struct timeval *, struct timeval *); +void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0); +void use_int(int result); +void use_pointer(void *result); +uint64 usecs_spent(void); +void touch(char *buf, int size); 
+size_t* permutation(int max, int scale); +int cp(char* src, char* dst, mode_t mode); +long bread(void* src, long count); + +#if defined(hpux) || defined(__hpux) +int getpagesize(); +#endif + +#endif /* _TIMING_H */ diff --git a/performance/lmbench3/src/timing_o.c b/performance/lmbench3/src/timing_o.c new file mode 100644 index 0000000..7b9a42a --- /dev/null +++ b/performance/lmbench3/src/timing_o.c @@ -0,0 +1,10 @@ +#include <stdio.h> +#include "bench.h" + +int +main() +{ + putenv("LOOP_O=0.0"); + printf("%lu\n", (unsigned long)t_overhead()); + return (0); +} diff --git a/performance/lmbench3/src/tlb.c b/performance/lmbench3/src/tlb.c new file mode 100644 index 0000000..5ad13c9 --- /dev/null +++ b/performance/lmbench3/src/tlb.c @@ -0,0 +1,178 @@ +/* + * tlb.c - guess the cache line size + * + * usage: tlb [-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +int find_tlb(int start, int maxpages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state); +void compute_times(int pages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state); + +#define THRESHOLD 1.15 + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines no larger than 1/8 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, l, len, tlb, maxpages; + int c; + int print_cost = 0; + int warmup = 0; + int repetitions = TRIES; + double tlb_time, cache_time, diff; + struct mem_state state; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + + maxpages = 16 * 1024; + state.width = 1; + state.pagesize = getpagesize(); + state.line = sizeof(char*); + + tlb = 2; + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + state.line = atoi(optarg); + break; + case 'M': + maxpages = bytes(optarg); /* max in bytes */ + maxpages /= getpagesize(); /* max in pages */ + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* assumption: no TLB will have less than 16 entries */ + tlb = find_tlb(8, maxpages, warmup, repetitions, &tlb_time, &cache_time, &state); + + if (tlb > 0) { + if (print_cost) { + compute_times(tlb * 2, warmup, repetitions, &tlb_time, &cache_time, &state); + fprintf(stderr, "tlb: %d pages %.5f nanoseconds\n", tlb, tlb_time - cache_time); + } else { + fprintf(stderr, "tlb: %d pages\n", tlb); + } + } + + /* + for (i = tlb<<1; i <= maxpages; i<<=1) { + compute_times(i, warmup, repetitions, &tlb_time, &cache_time, &state); + } + /**/ + + return(0); +} + +int +find_tlb(int start, int maxpages, int warmup, int 
repetitions, + double* tlb_time, double* cache_time, struct mem_state* state) +{ + int i, lower, upper; + + for (i = start; i <= maxpages; i<<=1) { + compute_times(i, warmup, repetitions, tlb_time, cache_time, state); + + if (*tlb_time / *cache_time > THRESHOLD) { + lower = i>>1; + upper = i; + i = lower + (upper - lower) / 2; + break; + } + } + + /* we can't find any tlb effect */ + if (i >= maxpages) { + state->len = 0; + return (0); + } + + /* use a binary search to locate point at which TLB effects start */ + while (lower + 1 < upper) { + compute_times(i, warmup, repetitions, tlb_time, cache_time, state); + + if (*tlb_time / *cache_time > THRESHOLD) { + upper = i; + } else { + lower = i; + } + i = lower + (upper - lower) / 2; + } + return (lower); +} + +void +compute_times(int pages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state) +{ + int i; + result_t tlb_results, cache_results, *r_save; + + r_save = get_results(); + insertinit(&tlb_results); + insertinit(&cache_results); + + state->len = pages * state->pagesize; + state->maxlen = pages * state->pagesize; + tlb_initialize(0, state); + if (state->initialized) { + for (i = 0; i < TRIES; ++i) { + BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0); + insertsort(gettime(), get_n(), &tlb_results); + } + } + tlb_cleanup(0, state); + + state->len = pages * state->line; + state->maxlen = pages * state->line; + mem_initialize(0, state); + if (state->initialized) { + for (i = 0; i < TRIES; ++i) { + BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0); + insertsort(gettime(), get_n(), &cache_results); + } + } + mem_cleanup(0, state); + + /* We want nanoseconds / load. */ + set_results(&tlb_results); + *tlb_time = (1000. * (double)gettime()) / (100. * (double)get_n()); + + /* We want nanoseconds / load. */ + set_results(&cache_results); + *cache_time = (1000. * (double)gettime()) / (100. 
* (double)get_n()); + set_results(r_save); + + /* + fprintf(stderr, "%d %.5f %.5f\n", pages, *tlb_time, *cache_time); + /**/ +} + diff --git a/performance/lmbench3/src/version.h b/performance/lmbench3/src/version.h new file mode 100644 index 0000000..0dc306d --- /dev/null +++ b/performance/lmbench3/src/version.h @@ -0,0 +1,2 @@ +#define MAJOR 3 +#define MINOR -4 /* negative is alpha, it "increases" */ diff --git a/performance/lmbench3/src/webpage-lm.tar b/performance/lmbench3/src/webpage-lm.tar new file mode 100644 index 0000000..1e5bc3b Binary files /dev/null and b/performance/lmbench3/src/webpage-lm.tar differ diff --git a/performance/lmbench3/src/webpage-lm/URLS b/performance/lmbench3/src/webpage-lm/URLS new file mode 100644 index 0000000..4f54841 --- /dev/null +++ b/performance/lmbench3/src/webpage-lm/URLS @@ -0,0 +1,14 @@ +./pictures/me-small.jpg +./gifs/snow-bg2.jpg +./gifs/rib_bar_wh.gif +./gifs/spam-not.gif +./gifs/pookline.gif +./gifs/blueline +./gifs/eyes.gif +./gifs/eyesleft.gif +./gifs/new.gif +./gifs/line1.gif +./gifs/cclip3.gif +./gifs/sgi_logo.gif +./index.html +./URLS diff --git a/performance/lmbench3/src/webpage-lm/gifs/blueline b/performance/lmbench3/src/webpage-lm/gifs/blueline new file mode 100644 index 0000000..868d4fe Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/blueline differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif b/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif new file mode 100644 index 0000000..4697447 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/eyes.gif b/performance/lmbench3/src/webpage-lm/gifs/eyes.gif new file mode 100644 index 0000000..443bce7 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/eyes.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif b/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif new file mode 100644 index 
0000000..6b5305b Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/line1.gif b/performance/lmbench3/src/webpage-lm/gifs/line1.gif new file mode 100644 index 0000000..a8de25e Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/line1.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/new.gif b/performance/lmbench3/src/webpage-lm/gifs/new.gif new file mode 100644 index 0000000..7df4823 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/new.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/pookline.gif b/performance/lmbench3/src/webpage-lm/gifs/pookline.gif new file mode 100644 index 0000000..593f7f3 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/pookline.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif b/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif new file mode 100644 index 0000000..02e55ff Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif b/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif new file mode 100644 index 0000000..84baa47 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg b/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg new file mode 100644 index 0000000..3748971 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif b/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif new file mode 100644 index 0000000..7e89689 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif differ diff --git a/performance/lmbench3/src/webpage-lm/index.html b/performance/lmbench3/src/webpage-lm/index.html 
new file mode 100644 index 0000000..ed7ee98 --- /dev/null +++ b/performance/lmbench3/src/webpage-lm/index.html @@ -0,0 +1,253 @@ +<html> +<body background=gifs/snow-bg2.jpg> +<TITLE>Larry McVoy's home page</Title> + +<p align=center> +<A HREF=http://www.eff.org> <img src="gifs/rib_bar_wh.gif"> </a> +<A HREF=http://www.cauce.org> <img src="gifs/spam-not.gif"> </a> +</p> +<img src="gifs/pookline.gif"> +<img src="gifs/blueline"> +<H1 align=center>Larry McVoy's home page</H1> +<img src="gifs/blueline"> +<img src="gifs/pookline.gif"> +<p align=center> +<A HREF="pictures/me.jpg""> +<img src="pictures/me-small.jpg"> </a> +</p> +<H1 align=center> +Notice: I'm moving to a new job. New email is lm@xxxxxxx. +</H1> +</p> +<H1 align=center> +Who am I? +</H1> +<p> +I'm an engineer for Silicon Graphics, working in the networking group. +I spend most of my time waving my hands and convincing other people they +want to work on stuff that I think is important. The name server is +an example, I got +<A HREF="/jes_engr/">John Schimmel</a> to work on that. +I'm constantly trying +to figure out how to make things go fast, which is why I wrote the +<A HREF="lmbench/lmbench.html">lmbench</a> benchmark suite. Lmbench +measures the basic building blocks of a computer system. +Occasionally, I have to +do real work, like the BDS stuff mentioned below. +</p> +<p> +I live in San Francisco and divide my time there between my girlfriend, +woodworking, playing pool, and riding motorcycles. +<H1 align=center> +Current stuff I'm working on (slides) +</H1> +<UL> +<font size=+1> +<LI> +<img src="gifs/eyes.gif"> +<A HREF="lmbench/lmbench.html"> +lmbench benchmark suite with results</A> +<img src="gifs/eyesleft.gif"> +<br> +Watch this space for a new lmbench in July. 
+<LI><A HREF="diskbench/diskbench.html">New disk benchmarking tools</A> +<LI><A HREF="lamed.html">New free name server architecture</A> +<img src="gifs/new.gif"> +<LI><A HREF="talks/bds.ps">Bulk Data Service: 50MB/sec over NFS</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/EIS.slides.ps"> +EIS project (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/bds.Aug28.96.ps"> +BDS marketing talk of August 28 (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/nsf.Aug28.96.ps"> +SuperHippi presentation for NSF on August 28 (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/net.futures.Sep9.96.ps"> +SGI networking roadmap (SGI internal only)</A> +</font> +</UL> +<img src="gifs/pookline.gif"> +<H1 align=center> +Papers I've written +</H1> +<UL> +<LI><A HREF="lmbench/lmbench-usenix.ps">lmbench usenix paper</A> +with <A HREF="http://http.cs.berkeley.edu/~staelin/">Carl Staelin.</a> +<LI><A HREF="papers/SunOS.ufs_clustering.ps">SunOS UFS clustering usenix paper</A> +<LI><A HREF="papers/freeos.ps">A proposal to unify Unix (sigh)</A> +<LI><A HREF="papers/gkim.ps">A parallel NFS using RPC vectors (coauthor)</A> +<LI><A HREF="papers/nvram.ps">A early paper describing high perf disks</A> +<LI><A HREF="papers/smoosh.ps">A description of the core of Sun's Teamware +source management system</A> +<LI><A HREF="papers/sunbox.netarch.ps">SparcCluster 1 architecture - VLANs +came from this</A> +<LI><A HREF="lmdd.shar">Latest lmdd benchmarking source</a> +</UL> +<img src="gifs/pookline.gif"> +<H1 align=center> +Personal stuff (lots of pictures) +</H1> +<img src="gifs/line1.gif"> +<H2>Me, my relatives, friends, etc.</H2> +<UL> +<LI> +Me and <A HREF="pictures/me+jacob.jpg">my nephew</A> +Jacob at Ocean Beach in +San Francisco. He was about 2 years old and still hadn't hit the terrible +twos, I think his Mom must have done a good job. 
+Here he is with <a href="pictures/annelies+jacob.jpg">his Mom</a> about +7 months pregnant. The next one turned out to be a boy +<a href="pictures/annelies+zeke.jpg">named Zeke</a>. +<LI>My brother <A HREF="pictures/chris.jpg">Chris</A> trying to look smart. +<LI>I used to be even more crazy than I am now; here's a picture of me +doing some stupid <A HREF="pictures/skating.jpg">rollerblading</A> tricks. +<LI>My <A HREF="pictures/sail.jpg">favorite</A> picture of me. +<LI>I work at <A HREF="pictures/working.jpg">home</A> a lot and this is +what that is like. My cat was pretty sick in that picture, but I nursed her back to +the land of the living. +<LI><A HREF="pictures/me1.jpg">Me studying</A>. +<LI>A really old picture of me in Mexico, with really long hair +<A HREF="pictures/juggling.jpg">juggling</A>. +</UL> +<img src="gifs/line1.gif"> +<H2>My cats</H2> +<UL> +<LI>I like cats and I have had two over the last 18 years (whew) or so. +Here's <A HREF="pictures/zoey.jpg">Zoey</A> after she's had a few. +Looks possessed, doesn't she? Here's a +<A HREF="pictures/zoey2.jpg">better</a> picture of her. Until she died around +Christmas of 1994, she had outlasted all of my girlfriends - I had her +for almost 14 years. I still miss her and sometimes look for her when +I go into the kitchen - it's weird to think she's gone. +I eventually decided not to mope over her forever and went and +found <A HREF="pictures/mama+linux.jpg">Mama cat</A> +at the pound. That's Linux running on the PC next to her, she fixes a lot +of mouse driver bugs. +Here's another picture of <A HREF="pictures/cat.jpg">Mama cat on the workbench +</a>. And one <A HREF="pictures/mama1.jpg">more</a> of here in my van - she +likes to travel, no kidding. One last <A HREF="pictures/mama2.jpg">shot</a> +of her. +<LI>November '96: Mama cat is missing. +We're still looking for her, but it has been two weeks and +it isn't looking very hopeful. +<LI>January '97: Mama cat is still missing. 
I go to the pound about +once a week with no luck. It sucks. +</UL> +<img src="gifs/line1.gif"> +<H2>Fishing</H2> +<UL> +<LI>I like to fly fish (yeah, I tie my own, ooh, wow) and I took +a trip with my friend John Weitz. John is a hot shot +photographer and here he is at <A HREF="pictures/jonw-pic.jpg">work</a>. +Here's John catching a <A +HREF="pictures/john-fishing.jpg">trout</A> in the Trinity Alps. This is +<A HREF="pictures/fishing.jpg">me fishing</A> in the upper Sacramento River. +John was talking some shots of a cool old +<A HREF="pictures/house.jpg">shed</a>, +so I took one too. Here's a shot that John took of +<A HREF="pictures/redneck.jpg">me sitting in the doorway</a> of that shed +(warning: it's ~60Kb). +<LI>This is the ultimate in fishing tall tails, except I have pictures +to prove it happened. I was fishing in Canada and thought I had hooked +some weeds. I was reeling it in when all at once it took off. Funny sort +of fish, it felt weird. When I get closer, I saw that I had two fish - +a little one that had hit the lure, and a big +<A HREF="pictures/pike1.jpg">Northern Pike</A> that had hit the little pike. +I thought for sure he would let go when he saw me, but I guess he was +hungry, because I +<A HREF="pictures/pike2.jpg">picked him up</A>. Pretty wild, huh? +</UL> +<img src="gifs/line1.gif"> +<H2>Wilderness</H2> +<UL> +<LI>I like to <A HREF="pictures/backpacking.jpg">backpack</a> +a lot and I have some friends that go with me. Here's +<A HREF="pictures/neail+elvis2.jpg">Neil</a> with his dog Elvis and +<A HREF="pictures/neil+elvis.jpg">here</a> they are again hard at work. +<LI>Me <A HREF="pictures/me-skiing.jpg">cross country skiing</A> in the Sierra +back country. It was a weekend trip to Ostrander Hut/Lake (cool place). +I think that is Yosemite Valley in the background, doesn't that look like +half dome to you? Here's the same +<A HREF="pictures/skiing.jpg">view</a> +about 5 years earlier with my friends +John G., Bernd N., and Andy A. 
+<LI>My Dad's <A HREF="pictures/canoe+cover.jpg">Mad River canoe with a +cover</A> that my sister made (pretty cool cover, if you ask me, it +kept us dry). We go canoeing in Canada quite a bit. +</UL> +<img src="gifs/line1.gif"> +<H2>Woodworking</H2> +<UL> +<LI>I am not just a computer nerd, I'm also a woodworking nerd, and I'm +especially nerdy about <A HREF="pictures/planes.jpg">hand planes.</a> Many +of those are a hundred years old, some are more than that ("they don't +make 'em like they used to" definitely applies to tools). +Here's my first effort at a real woodworking project, what else, a +<A HREF="pictures/toolbox3.jpg">toolbox</a>. Here's a view with the +<A HREF="pictures/toolbox2.jpg">drawers open.</a> The little box on top is a +jewelry box (or whatever) I made for an old girlfriend. +I live in San Francisco, in +a flat, so my workshop is out on my <A HREF="pictures/jointer.jpg">back deck</a>. +That's a small jointer in the foreground and a table saw clamped to the +rails in the background. It's a bit cramped, but it has a nice +<A HREF="pictures/jointer2.jpg">view.</a> +I finally decided to build a <A HREF="pictures/workbench.jpg">workbench.</a> +Here's the <A HREF="pictures/benchtop.jpg">benchtop</a> in the process of being +hand planed flat (lotso shavings, huh?). +<LI> +I do stuff on commission sometimes, this is my last girlfriend with a +<A HREF="pictures/bookshelf.jpg">bookshelf</a> I built for a friend at +work. It was pretty simple since it was a first try, but he liked it. +Here's another picture of the <A HREF="pictures/bookshelf2.jpg">bookshelf</a>. +<LI>Here I am proudly showing off a little +<A HREF="pictures/tv-cabinet+me2.jpg">TV cabinet</a> made out of pine with some +really interesting grain. That's the heartwood of the pine. +Here's a <A HREF="pictures/tv-cabinet.jpg">closeup</a> picture of +the cabinet. 
+ +<LI> +Because space is tight in San Francisco, I think my next project will be +a tall, thin +<A HREF="pictures/chest.gif"> +chest of drawers</a> +sort of like a lingerie chest, only sized for guy's clothes. It's about +six feet tall by 18 inches square, which I think is about right. This was +drawn in James Clark's implementation of pic, in the groff tool suite. +Perverse, I know. +<LI> +Here is a document on <a href="papers/flattening.html">flattening</a> +hand planes, something that is frequently required for good performance. +</UL> +<img src="gifs/line1.gif"> +<H2>Amusements</H2> +<UL> +<LI>A <A href="excited.html">song</a> composed in my honor. No kidding. +It's pretty cute but you might need to know a little about Sun's internal +politics to completely get it. +<p> +<LI>A +<A HREF="javaletter.html"> +letter</a> +that Sun's lawyers recently sent. It's amazing how frigging +self centered people can be. I got yer Java right here, buddy. +<p> +A few days later, the net <A HREF="javaresp.html">responds.</a> +<p> +<li>Here are a bunch of +<A HREF="quote.html">quotes</a> +that I either liked or were attributed to me. A lot of these are pretty +nerdy engineer inside jokes, you've been warned. 
+</ul> +<img src="gifs/cclip3.gif"> +<p> +<img src="gifs/sgi_logo.gif" align=right> +<br> +<address> +Larry McVoy, +<a href="mailto:lm@xxxxxxx">lm@xxxxxxx</a> +</address> +<p> +Page accesses since Wed Jun 26 1996: +<img align=texttop src="/cgi-bin/Imagemap/hitcount?lm_home" +alt="[Sorry, counter is a GIF image!]"><br> +</p> + +</body></html> diff --git a/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg b/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg new file mode 100644 index 0000000..4205e8c Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg differ diff --git a/runtests.sh b/runtests.sh index d93cc70..924df4f 100755 --- a/runtests.sh +++ b/runtests.sh @@ -65,8 +65,11 @@ destructive) testset=default fi ;; +performance) + dirlist="performance" + ;; *) - echo "supported test sets are minimal default stress or destructive" + echo "supported test sets are minimal, default, stress, destructive or performance" exit 1 esac @@ -92,19 +95,24 @@ do #TO DO: purpose file test name format testname=$testdir echo "Starting test $testname" >> $logfile - ./runtest.sh &>>$logfile - complete=$? - case $complete in - 0) - result=PASS - ;; - 3) - result=SKIP - ;; - *) - result=FAIL - esac - printf "%-65s%-8s\n" "$testname" "$result" + + if [ "$testset" == "performance" ]; then + ./runtest.sh >>$logfile + else + ./runtest.sh &>>$logfile + complete=$? + case $complete in + 0) + result=PASS + ;; + 3) + result=SKIP + ;; + *) + result=FAIL + esac + printf "%-65s%-8s\n" "$testname" "$result" + fi popd &>/dev/null done done -- To stop receiving notification emails like this one, please contact the administrator of this repository. _______________________________________________ kernel mailing list -- kernel@xxxxxxxxxxxxxxxxxxxxxxx To unsubscribe send an email to kernel-leave@xxxxxxxxxxxxxxxxxxxxxxx