This is an automated email from the git hooks/post-receive script. jforbes pushed a commit to branch master in repository kernel-tests. commit 7e2ed1c1e25b1ce470c12592e34aa271988434a2 Author: Dave Jones <davej@xxxxxxxxxx> Date: Thu Oct 4 14:43:52 2012 -0400 Add a performance target. Introduce lmbench as the first perf benchmark --- performance/lmbench3/ACKNOWLEDGEMENTS | 78 + performance/lmbench3/CHANGES | 82 + performance/lmbench3/COPYING | 339 ++ performance/lmbench3/COPYING-2 | 108 + performance/lmbench3/Makefile | 72 + performance/lmbench3/README | 23 + performance/lmbench3/doc/Makefile | 105 + performance/lmbench3/doc/bargraph.1 | 135 + performance/lmbench3/doc/benchmarks | 68 + performance/lmbench3/doc/bw_allmem.tbl | 61 + performance/lmbench3/doc/bw_file_rd.8 | 59 + performance/lmbench3/doc/bw_ipc.tbl | 53 + performance/lmbench3/doc/bw_mem.8 | 95 + performance/lmbench3/doc/bw_mem_rd.8 | 29 + performance/lmbench3/doc/bw_mmap_rd.8 | 46 + performance/lmbench3/doc/bw_pipe.8 | 59 + performance/lmbench3/doc/bw_reread2.tbl | 61 + performance/lmbench3/doc/bw_tcp.8 | 71 + performance/lmbench3/doc/bw_tcp.tbl | 57 + performance/lmbench3/doc/bw_unix.8 | 48 + performance/lmbench3/doc/cache.8 | 49 + performance/lmbench3/doc/ctx.pic | 198 + performance/lmbench3/doc/ctx.tbl | 63 + performance/lmbench3/doc/description.ms | 531 +++ performance/lmbench3/doc/graph.1 | 143 + performance/lmbench3/doc/lat_allmem.tbl | 62 + performance/lmbench3/doc/lat_allproc.tbl | 60 + performance/lmbench3/doc/lat_connect.8 | 47 + performance/lmbench3/doc/lat_connect.tbl | 44 + performance/lmbench3/doc/lat_ctx.8 | 95 + performance/lmbench3/doc/lat_disk.tbl | 23 + performance/lmbench3/doc/lat_fcntl.8 | 32 + performance/lmbench3/doc/lat_fifo.8 | 32 + performance/lmbench3/doc/lat_fs.8 | 37 + performance/lmbench3/doc/lat_fs.tbl | 56 + performance/lmbench3/doc/lat_http.8 | 41 + performance/lmbench3/doc/lat_ipc.tbl | 16 + performance/lmbench3/doc/lat_mem_rd.8 | 97 + performance/lmbench3/doc/lat_mmap.8 
| 45 + performance/lmbench3/doc/lat_nullsys.tbl | 58 + performance/lmbench3/doc/lat_ops.8 | 37 + performance/lmbench3/doc/lat_pagefault.8 | 46 + performance/lmbench3/doc/lat_pipe.8 | 38 + performance/lmbench3/doc/lat_pipe.tbl | 58 + performance/lmbench3/doc/lat_proc.8 | 58 + performance/lmbench3/doc/lat_rpc.8 | 68 + performance/lmbench3/doc/lat_select.8 | 33 + performance/lmbench3/doc/lat_sig.8 | 33 + performance/lmbench3/doc/lat_signal.tbl | 48 + performance/lmbench3/doc/lat_syscall.8 | 70 + performance/lmbench3/doc/lat_tcp.8 | 52 + performance/lmbench3/doc/lat_tcp.tbl | 59 + performance/lmbench3/doc/lat_udp.8 | 52 + performance/lmbench3/doc/lat_udp.tbl | 56 + performance/lmbench3/doc/lat_unix.8 | 41 + performance/lmbench3/doc/lat_unix_connect.8 | 43 + performance/lmbench3/doc/line.8 | 50 + performance/lmbench3/doc/lmbench.3 | 344 ++ performance/lmbench3/doc/lmbench.8 | 222 ++ performance/lmbench3/doc/lmbench3.ms | 1853 ++++++++++ performance/lmbench3/doc/lmbench3_arch.fig | 119 + performance/lmbench3/doc/lmbench3_signals.fig | 95 + performance/lmbench3/doc/lmdd.8 | 146 + performance/lmbench3/doc/mem.pic | 2337 ++++++++++++ performance/lmbench3/doc/memhier-color.d | 86 + performance/lmbench3/doc/memhier-line.d | 34 + performance/lmbench3/doc/memhier-tlb.d | 407 +++ performance/lmbench3/doc/memhier.ms | 1576 ++++++++ performance/lmbench3/doc/mhz.8 | 29 + performance/lmbench3/doc/par_mem.8 | 68 + performance/lmbench3/doc/par_ops.8 | 39 + performance/lmbench3/doc/parallel.ms | 385 ++ performance/lmbench3/doc/pgraph.1 | 155 + performance/lmbench3/doc/rccs.1 | 149 + performance/lmbench3/doc/refdbms.keys | 20 + performance/lmbench3/doc/references | 186 + performance/lmbench3/doc/references- | 175 + performance/lmbench3/doc/references-lmbench3 | 430 +++ performance/lmbench3/doc/references-memhier | 251 ++ performance/lmbench3/doc/references-parallel | 171 + performance/lmbench3/doc/references-userguide | 338 ++ performance/lmbench3/doc/references.private | 7 + 
performance/lmbench3/doc/reporting.3 | 71 + performance/lmbench3/doc/results.3 | 88 + performance/lmbench3/doc/stream.8 | 28 + performance/lmbench3/doc/timing.3 | 163 + performance/lmbench3/doc/tlb.8 | 55 + performance/lmbench3/doc/tmac.usenix | 1848 ++++++++++ performance/lmbench3/doc/usenix.ol | 102 + performance/lmbench3/doc/usenix96.ms | 1798 ++++++++++ performance/lmbench3/doc/userguide.ms | 3782 ++++++++++++++++++++ performance/lmbench3/hbench-REBUTTAL | 245 ++ performance/lmbench3/results/Makefile | 320 ++ performance/lmbench3/runtest.sh | 13 + performance/lmbench3/scripts/Makefile | 8 + performance/lmbench3/scripts/README | 7 + performance/lmbench3/scripts/SHIT | 724 ++++ performance/lmbench3/scripts/TODO | 3 + performance/lmbench3/scripts/allctx | 71 + performance/lmbench3/scripts/allmem | 69 + performance/lmbench3/scripts/bargraph | 430 +++ performance/lmbench3/scripts/bghtml | 39 + performance/lmbench3/scripts/build | 252 ++ performance/lmbench3/scripts/compiler | 16 + performance/lmbench3/scripts/config | 7 + performance/lmbench3/scripts/config-run | 783 ++++ performance/lmbench3/scripts/config-scaling | 160 + performance/lmbench3/scripts/depend | 28 + performance/lmbench3/scripts/do_ctx | 35 + performance/lmbench3/scripts/getbg | 806 +++++ performance/lmbench3/scripts/getbw | 260 ++ performance/lmbench3/scripts/getctx | 79 + performance/lmbench3/scripts/getdisk | 69 + performance/lmbench3/scripts/getlist | 31 + performance/lmbench3/scripts/getmax | 73 + performance/lmbench3/scripts/getmem | 69 + performance/lmbench3/scripts/getpercent | 400 +++ performance/lmbench3/scripts/getresults | 99 + performance/lmbench3/scripts/getsummary | 1089 ++++++ performance/lmbench3/scripts/gifs | 33 + performance/lmbench3/scripts/gnu-os | 1439 ++++++++ performance/lmbench3/scripts/graph | 947 +++++ performance/lmbench3/scripts/html-list | 123 + performance/lmbench3/scripts/html-man | 83 + performance/lmbench3/scripts/info | 7 + performance/lmbench3/scripts/info-template 
| 42 + performance/lmbench3/scripts/lmbench | 483 +++ performance/lmbench3/scripts/make | 20 + performance/lmbench3/scripts/man2html | 254 ++ performance/lmbench3/scripts/mkrelease | 23 + performance/lmbench3/scripts/new2oldctx | 31 + performance/lmbench3/scripts/opercent | 92 + performance/lmbench3/scripts/os | 20 + performance/lmbench3/scripts/output | 10 + performance/lmbench3/scripts/percent | 95 + performance/lmbench3/scripts/rccs | 733 ++++ performance/lmbench3/scripts/results | 39 + performance/lmbench3/scripts/save | 26 + performance/lmbench3/scripts/stats | 50 + performance/lmbench3/scripts/statsummary | 1075 ++++++ performance/lmbench3/scripts/synchronize | 60 + performance/lmbench3/scripts/target | 24 + performance/lmbench3/scripts/version | 25 + performance/lmbench3/scripts/xroff | 5 + performance/lmbench3/src/Makefile | 506 +++ performance/lmbench3/src/TODO | 107 + performance/lmbench3/src/bench.h | 323 ++ performance/lmbench3/src/bk.ver | 1 + performance/lmbench3/src/busy.c | 10 + performance/lmbench3/src/bw_file_rd.c | 192 + performance/lmbench3/src/bw_mem.c | 468 +++ performance/lmbench3/src/bw_mmap_rd.c | 185 + performance/lmbench3/src/bw_pipe.c | 187 + performance/lmbench3/src/bw_tcp.c | 251 ++ performance/lmbench3/src/bw_udp.c | 203 ++ performance/lmbench3/src/bw_unix.c | 190 + performance/lmbench3/src/cache.c | 750 ++++ performance/lmbench3/src/clock.c | 24 + performance/lmbench3/src/disk.c | 310 ++ performance/lmbench3/src/enough.c | 13 + performance/lmbench3/src/flushdisk.c | 42 + performance/lmbench3/src/getopt.c | 154 + performance/lmbench3/src/hello.c | 8 + performance/lmbench3/src/lat_cmd.c | 100 + performance/lmbench3/src/lat_connect.c | 110 + performance/lmbench3/src/lat_ctx.c | 350 ++ performance/lmbench3/src/lat_dram_page.c | 201 ++ performance/lmbench3/src/lat_fcntl.c | 224 ++ performance/lmbench3/src/lat_fifo.c | 165 + performance/lmbench3/src/lat_fs.c | 272 ++ performance/lmbench3/src/lat_http.c | 128 + 
performance/lmbench3/src/lat_mem_rd.c | 169 + performance/lmbench3/src/lat_mmap.c | 175 + performance/lmbench3/src/lat_ops.c | 485 +++ performance/lmbench3/src/lat_pagefault.c | 202 ++ performance/lmbench3/src/lat_pipe.c | 155 + performance/lmbench3/src/lat_pmake.c | 158 + performance/lmbench3/src/lat_proc.c | 182 + performance/lmbench3/src/lat_rand.c | 120 + performance/lmbench3/src/lat_rpc.c | 285 ++ performance/lmbench3/src/lat_select.c | 223 ++ performance/lmbench3/src/lat_sem.c | 162 + performance/lmbench3/src/lat_sig.c | 213 ++ performance/lmbench3/src/lat_syscall.c | 175 + performance/lmbench3/src/lat_tcp.c | 175 + performance/lmbench3/src/lat_udp.c | 207 ++ performance/lmbench3/src/lat_unix.c | 130 + performance/lmbench3/src/lat_unix_connect.c | 102 + performance/lmbench3/src/lat_usleep.c | 259 ++ performance/lmbench3/src/lib_debug.c | 131 + performance/lmbench3/src/lib_debug.h | 10 + performance/lmbench3/src/lib_mem.c | 699 ++++ performance/lmbench3/src/lib_mem.h | 60 + performance/lmbench3/src/lib_sched.c | 239 ++ performance/lmbench3/src/lib_stats.c | 603 ++++ performance/lmbench3/src/lib_tcp.c | 238 ++ performance/lmbench3/src/lib_tcp.h | 12 + performance/lmbench3/src/lib_timing.c | 1774 +++++++++ performance/lmbench3/src/lib_udp.c | 96 + performance/lmbench3/src/lib_udp.h | 12 + performance/lmbench3/src/lib_unix.c | 97 + performance/lmbench3/src/lib_unix.h | 8 + performance/lmbench3/src/line.c | 68 + performance/lmbench3/src/lmdd.1 | 131 + performance/lmbench3/src/lmdd.c | 893 +++++ performance/lmbench3/src/lmhttp.c | 397 ++ performance/lmbench3/src/loop_o.c | 8 + performance/lmbench3/src/memsize.c | 192 + performance/lmbench3/src/mhz.c | 507 +++ performance/lmbench3/src/msleep.c | 21 + performance/lmbench3/src/names.h | 102 + performance/lmbench3/src/par_mem.c | 81 + performance/lmbench3/src/par_ops.c | 501 +++ performance/lmbench3/src/rhttp.c | 125 + performance/lmbench3/src/seek.c | 65 + performance/lmbench3/src/stats.h | 61 + 
performance/lmbench3/src/stream.c | 309 ++ performance/lmbench3/src/timing.h | 52 + performance/lmbench3/src/timing_o.c | 10 + performance/lmbench3/src/tlb.c | 178 + performance/lmbench3/src/version.h | 2 + performance/lmbench3/src/webpage-lm.tar | Bin 0 -> 61440 bytes performance/lmbench3/src/webpage-lm/URLS | 14 + performance/lmbench3/src/webpage-lm/gifs/blueline | Bin 0 -> 596 bytes .../lmbench3/src/webpage-lm/gifs/cclip3.gif | Bin 0 -> 640 bytes performance/lmbench3/src/webpage-lm/gifs/eyes.gif | Bin 0 -> 125 bytes .../lmbench3/src/webpage-lm/gifs/eyesleft.gif | Bin 0 -> 125 bytes performance/lmbench3/src/webpage-lm/gifs/line1.gif | Bin 0 -> 270 bytes performance/lmbench3/src/webpage-lm/gifs/new.gif | Bin 0 -> 116 bytes .../lmbench3/src/webpage-lm/gifs/pookline.gif | Bin 0 -> 773 bytes .../lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif | Bin 0 -> 752 bytes .../lmbench3/src/webpage-lm/gifs/sgi_logo.gif | Bin 0 -> 4002 bytes .../lmbench3/src/webpage-lm/gifs/snow-bg2.jpg | Bin 0 -> 3830 bytes .../lmbench3/src/webpage-lm/gifs/spam-not.gif | Bin 0 -> 1322 bytes performance/lmbench3/src/webpage-lm/index.html | 253 ++ .../lmbench3/src/webpage-lm/pictures/me-small.jpg | Bin 0 -> 16292 bytes runtests.sh | 36 +- 237 files changed, 50723 insertions(+), 14 deletions(-) diff --git a/performance/lmbench3/ACKNOWLEDGEMENTS b/performance/lmbench3/ACKNOWLEDGEMENTS new file mode 100644 index 0000000..788a59f --- /dev/null +++ b/performance/lmbench3/ACKNOWLEDGEMENTS @@ -0,0 +1,78 @@ +LMbench was originally developed by Larry McVoy while he worked +at Sun Microsystems. Larry continued development while working +at Silicon Graphics, and was joined by Carl Staelin, who works +for Hewlett-Packard Laboratories. + +LMbench would not be the successful cross-platform benchmark +that it is today without the efforts and assistance of a wide +range of people. 
From volunteers who run it on various hardware +and report bugs, to managers who provide financial and other +support, to peers and colleagues who request features or +provide feedback on design elements. All such help has been +critical to making LMbench a success. + +Below is a partial list of all those people who helped support +the development of LMbench in one form or other, such as +benchmark suggestions, bug reports, and so forth. All omissions +are accidental, and if your name was not included, please accept +our humble apologies. + +The people who have helped LMbench include, in alphabetic +order: + +Ralf Baechle, +Christian Bau, +Nelson H. F. Beebe, +Anton Blanchard, +Joel Berman, +Paul Borrill, +Ed Bradford, +Len Brown, +Robert G. Brown, +Bruce Chapman, +Mark Culotta, +Fred Douglis, +Lars-Eke Eriksson, +Josh Fisher, +Marc Fleischmann, +John Fort, +Andy Glew, +Achim Gratz, +Richard Henderson, +Lev Iserovich, +Michael A. Julier, +Frans Kaashoek, +Brad Knowles, +Richard Littin, +Bil Long, +Udi Manber, +John Mashey, +David Miller, +Dejan Milojicic, +Ingo Molnar, +David Mosberger, +Satya Nishtala, +Kevin Normoyle, +Neal Nuckolls, +Steve Piatz, +Tim Prince, +James Riden, +Sam Roberts, +Philip Roth, +Chris Ruemmler, +Olli Savia, +Scott Schwartz, +Wayne Scott, +Stephan Somogyi, +Ratnakar Tiwari, +Linus Torvalds, +Dan Truong, +Dirk Twiehaus, +Duc Vianney, +Ramya Vijay, +Hai Vo-Ba, +David T. Wang, +Brian Whitney, +David Wilson, +Mitch Wright. + diff --git a/performance/lmbench3/CHANGES b/performance/lmbench3/CHANGES new file mode 100644 index 0000000..f1228a2 --- /dev/null +++ b/performance/lmbench3/CHANGES @@ -0,0 +1,82 @@ +lmbench3-alpha1 + Added new benchmark line, which determines the cache line size + + Added new benchmark tlb, which determines the effective TLB size. + Note that this may differ from the hardware TLB size due to OS + TLB entries and super-pages. 
+ + Added new benchmark par_mem, which determines the possible + speedup due to multiple memory reads progressing in parallel. + This number usually depends highly on the portion of the + memory hierarchy being probed, with higher caches generally + having greater parallelism. + + Added new benchmark cache, which determines the number of caches, + their sizes, latency, and available parallelism. It also + reports the latency and available parallelism for main memory. + + Added new benchmark lat_ops, which attempts to determine the + latency of basic operations, such as add, multiply and divide, + for a variety of data types, such as int, int64, float and + double. + + Added new benchmark par_ops, which attempts to determine the + available scaling of the various basic operations for various + data types. + + Added new benchmark stream, which reports memory bandwidth + numbers using benchmark kernels from John McCalpin's STREAM + and STREAM version 2 benchmarks. + + Added new benchmark lat_sem, which reports SysV semaphore latency. + + Added getopt() command line parsing to most benchmarks. + + Added a new benchmark timing harness, benchmp(), which makes + it relatively easy to design and build benchmarks which + measure system performance under a fixed load. It takes + a few parameters: + - initialize: a function pointer. If this is non-NULL + the function is called in the child processes after + the fork but before any benchmark-related work is + done. The function is passed a cookie from the + benchmp() call. This can be a pointer to a + data structure which lets the function know what + it needs to do. + - benchmark: a function pointer. This function + takes two parameters, an iteration count "iters", + and a cookie. The benchmarked activity must be + run "iters" times (or some integer multiple of + "iters". This function must be idempotent; ie., + the benchmark harness must be able to call it + as many times as necessary. + - cleanup: a function pointer. 
If this is non-NULL + the function is called after all benchmarking is + completed to cleanup any resources that may have + been allocated. + - enough: If this is non-zero then it is the minimum + amount of time, in micro-seconds, that the benchmark + must be run to provide reliable results. In most + cases this is left to zero to allow the harness to + autoscale the timing intervals to the system clock's + resolution/accuracy. + - parallel: this is the number of child processes + running the benchmark that should be run in parallel. + This is really the load factor. + - warmup: a time period in micro-seconds that each + child process must run the benchmarked process + before any timing intervals can begin. This is + to allow the system scheduler time to settle in + a parallel/distributed system before we begin + measurements. (If so desired) + - repetitions: If non-zero this is the number of + times we need to repeat each measurement. The + default is 11. + - cookie: An opaque value which can be used to + pass information to the initialize(), benchmark(), + and cleanup() routines. + This new harness is now used by: bw_file_rd, bw_mem, bw_mmap_rd, + bw_pipe, bw_tcp, bw_unix, lat_connect, lat_ctx, lat_fcntl, + lat_fifo, lat_mem_rd, lat_mmap, lat_ops, lat_pagefault, lat_pipe, + lat_proc, lat_rpc, lat_select, lat_sem, lat_sig, lat_syscall, + lat_tcp, lat_udp, lat_unix, lat_unix_connect, and stream. diff --git a/performance/lmbench3/COPYING b/performance/lmbench3/COPYING new file mode 100644 index 0000000..a43ea21 --- /dev/null +++ b/performance/lmbench3/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/performance/lmbench3/COPYING-2 b/performance/lmbench3/COPYING-2 new file mode 100644 index 0000000..3e1f7cc --- /dev/null +++ b/performance/lmbench3/COPYING-2 @@ -0,0 +1,108 @@ +%M% %I% %E% + +The set of programs and documentation known as "lmbench" are distributed +under the Free Software Foundation's General Public License with the +following additional restrictions (which override any conflicting +restrictions in the GPL): + +1. You may not distribute results in any public forum, in any publication, + or in any other way if you have modified the benchmarks. + +2. You may not distribute the results for a fee of any kind. This includes + web sites which generate revenue from advertising. + +If you have modifications or enhancements that you wish included in +future versions, please mail those to me, Larry McVoy, at lm@xxxxxxxxxxxx. 
+ +========================================================================= + +Rationale for the publication restrictions: + +In summary: + + a) LMbench is designed to measure enough of an OS that if you do well in + all categories, you've covered latency and bandwidth in networking, + disks, file systems, VM systems, and memory systems. + b) Multiple times in the past people have wanted to report partial results. + Without exception, they were doing so to show a skewed view of whatever + it was they were measuring (for example, one OS fit small processes into + segments and used the segment register to switch them, getting good + results, but did not want to report large process context switches + because those didn't look as good). + c) We insist that if you formally report LMbench results, you have to + report all of them and make the raw results file easily available. + Reporting all of them means in that same publication, a pointer + does not count. Formally, in this context, means in a paper, + on a web site, etc., but does not mean the exchange of results + between OS developers who are tuning a particular subsystem. + +We have a lot of history with benchmarking and feel strongly that there +is little to be gained and a lot to be lost if we allowed the results +to be published in isolation, without the complete story being told. + +There has been a lot of discussion about this, with people not liking this +restriction, more or less on the freedom principle as far as I can tell. +We're not swayed by that, our position is that we are doing the right +thing for the OS community and will stick to our guns on this one. + +It would be a different matter if there were 3 other competing +benchmarking systems out there that did what LMbench does and didn't have +the same reporting rules. There aren't and as long as that is the case, +I see no reason to change my mind and lots of reasons not to do so. 
I'm +sorry if I'm a pain in the ass on this topic, but I'm doing the right +thing for you and the sooner people realize that the sooner we can get on +to real work. + +Operating system design is largely an art of balancing tradeoffs. +In many cases improving one part of the system has negative effects +on other parts of the system. The art is choosing which parts to +optimize and which to not optimize. Just like in computer architecture, +you can optimize the common instructions (RISC) or the uncommon +instructions (CISC), but in either case there is usually a cost to +pay (in RISC uncommon instructions are more expensive than common +instructions, and in CISC common instructions are more expensive +than required). The art lies in knowing which operations are +important and optimizing those while minimizing the impact on the +rest of the system. + +Since lmbench gives a good overview of many important system features, +users may see the performance of the system as a whole, and can +see where tradeoffs may have been made. This is the driving force +behind the publication restriction: any idiot can optimize certain +subsystems while completely destroying overall system performance. +If said idiot publishes *only* the numbers relating to the optimized +subsystem, then the costs of the optimization are hidden and readers +will mistakenly believe that the optimization is a good idea. By +including the publication restriction readers would be able to +detect that the optimization improved the subsystem performance +while damaging the rest of the system performance and would be able +to make an informed decision as to the merits of the optimization. + +Note that these restrictions only apply to *publications*. We +intend and encourage lmbench's use during design, development, +and tweaking of systems and applications. 
If you are tuning the +linux or BSD TCP stack, then by all means, use the networking +benchmarks to evaluate the performance effects of various +modifications; Swap results with other developers; use the +networking numbers in isolation. The restrictions only kick +in when you go to *publish* the results. If you sped up the +TCP stack by a factor of 2 and want to publish a paper with the +various tweaks or algorithms used to accomplish this goal, then +you can publish the networking numbers to show the improvement. +However, the paper *must* also include the rest of the standard +lmbench numbers to show how your tweaks may (or may not) have +impacted the rest of the system. The full set of numbers may +be included in an appendix, but they *must* be included in the +paper. + +This helps protect the community from adopting flawed technologies +based on incomplete data. It also helps protect the community from +misleading marketing which tries to sell systems based on partial +(skewed) lmbench performance results. + +We have seen many cases in the past where partial or misleading +benchmark results have caused great harm to the community, and +we want to ensure that our benchmark is not used to perpetrate +further harm and support false or misleading claims. + + diff --git a/performance/lmbench3/Makefile b/performance/lmbench3/Makefile new file mode 100644 index 0000000..77671ff --- /dev/null +++ b/performance/lmbench3/Makefile @@ -0,0 +1,72 @@ +# Makefile for top level of lmbench +# $Id: Makefile 1.17 00/05/31 16:16:15+03:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ + +# Possible things to $(MAKE): +# +# build (default) go to the source directory and build the benchmark +# results go to the source directory and build and run the benchmark +# rerun run the benchmark again +# see see the results that came with this release +# Go to the results directory and read the Makefile. 
+# doc.lpr print the documentation +# doc.x preview the documentation (needs X, groff, pic, etc) +# clean go to the subdirs and $(MAKE) clean +# get $(MAKE) sure all files are checked out +# shar build a shippable shar archive + +SHELL=/bin/sh + +build: + cd src && $(MAKE) + +results: FRC + cd src && $(MAKE) results + +rerun: + cd src && $(MAKE) rerun + +see: + cd results && $(MAKE) summary percent 2>/dev/null | more + +doc.lpr: + cd doc && $(MAKE) PS && lpr *.PS + +doc.x: + cd doc && $(MAKE) x + +clobber clean: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && $(MAKE) clean); \ + done + /bin/rm -rf bin/* + +get: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && bk get -q); \ + done + @co -q + +info: + for i in doc src results scripts; do \ + echo ===== $$i =====; \ + (cd $$i && info); \ + done + +release: scripts/mkrelease + scripts/mkrelease + +scripts/mkrelease: + cd scripts && co mkrelease + +# XXX - . must be named lmbench for this to work +shar: + $(MAKE) clean + co -q Makefile + $(MAKE) get + cd .. && \ + find lmbench -type f -print | egrep -v 'noship|RCS' > /tmp/FILES + cd .. && shar -S -a -n lmbench1.0 -L 50K < /tmp/FILES + +FRC: diff --git a/performance/lmbench3/README b/performance/lmbench3/README new file mode 100644 index 0000000..81a505d --- /dev/null +++ b/performance/lmbench3/README @@ -0,0 +1,23 @@ +README for lmbench 2alpha8 net release. + +To run the benchmark, you should be able to say: + + cd src + make results + +If you want to see how you did compared to the other system results +included here, say + + make see + +Be warned that many of these benchmarks are sensitive to other things +being run on the system, mainly from CPU cache and CPU cycle effects. +So make sure your screen saver is not running, etc. 
+ +It's a good idea to do several runs and compare the output like so + + make results + make rerun + make rerun + make rerun + cd Results && make LIST=<your OS>/* diff --git a/performance/lmbench3/doc/Makefile b/performance/lmbench3/doc/Makefile new file mode 100644 index 0000000..6fa93cb --- /dev/null +++ b/performance/lmbench3/doc/Makefile @@ -0,0 +1,105 @@ +# Makefile for lmbench doc subdir. +# $Id: Makefile 1.20 03/03/10 10:26:17+02:00 staelin@xxxxxxxxxxxxxxxxxxxxxx $ + +SHELL=/bin/sh +DESC = description.ms +USENIX = tmac.usenix usenix96.ms +PIC = ctx.pic mem.pic +SCRIPTS = ../scripts/ +BASE=/usr/local +MANDIR=${BASE}/man + +MAN = \ + bargraph.1 graph.1 \ + lmbench.3 reporting.3 results.3 timing.3 \ + lmbench.8 mhz.8 cache.8 line.8 tlb.8 lmdd.8 \ + lat_proc.8 lat_mmap.8 lat_ctx.8 lat_syscall.8 lat_pipe.8 \ + lat_http.8 lat_tcp.8 lat_udp.8 lat_rpc.8 lat_connect.8 lat_fs.8 \ + lat_ops.8 lat_pagefault.8 lat_mem_rd.8 lat_select.8 \ + lat_fifo.8 lat_fcntl.8 lat_sig.8 lat_unix.8 lat_unix_connect.8 \ + bw_file_rd.8 bw_mem.8 bw_mmap_rd.8 \ + bw_pipe.8 bw_tcp.8 bw_unix.8 \ + par_ops.8 par_mem.8 + +ALL = $(DESC) $(USENIX) $(PIC) $(MAN) $(REFER) references + +.SUFFIXES: .pic .fig + +.fig.pic: + fig2dev -L pic $< $*.pic + +PS ps: $(ALL) + gindxbib references + groff -t -e -G -s -p -R $(USENIX) > USENIX.PS + #groff -s -p -mgs $(DESC) > DESC.PS + #groff -fH -man $(MAN) > MAN.PS + +X x: $(ALL) + gindxbib references + $(SCRIPTS)xroff -t -e -s -p -R $(USENIX) + #$(SCRIPTS)xroff -s -p -mgs $(DESC) + #$(SCRIPTS)xroff -man -fH $(MAN) + +text: $(ALL) + gindxbib references + gsoelim usenix96.ms | sed "s/expand doublebox/center/" | \ + sed s/doublebox// > Fixed.ms + groff -Tascii -t -e -s -p -R -mgs Fixed.ms 2>/dev/null | colcrt - | more + +userguide.ps: $(ALL) references-userguide userguide.ms \ + lmbench3_arch.pic lmbench3_signals.pic ctx.tbl \ + bw_allmem.tbl bw_ipc.tbl bw_reread2.tbl bw_tcp.tbl \ + lat_allmem.tbl lat_allproc.tbl lat_connect.tbl \ + lat_disk.tbl lat_fs.tbl 
lat_ipc.tbl lat_nullsys.tbl \ + lat_pipe.tbl lat_signal.tbl lat_tcp.tbl lat_udp.tbl + gindxbib references-userguide + groff -t -e -G -s -p -R tmac.usenix userguide.ms > userguide.ps + +memhier.ps: $(ALL) memhier-color.d memhier-tlb.d memhier-line.d references-memhier memhier.ms + gindxbib references-memhier + groff -G -t -e -s -p -R tmac.usenix memhier.ms > memhier.ps +# ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-color.graph > memhier-color.pic +# ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-line.graph > memhier-line.pic +# ../scripts/graph -logx -small -below -nomarks -nospace memhier-tlb.graph > memhier-tlb.pic + +lmbench3.ps: $(ALL) references-lmbench3 lmbench3.ms \ + lmbench3_arch.pic lmbench3_signals.pic + gindxbib references-lmbench3 + groff -G -t -e -s -p -R tmac.usenix lmbench3.ms > lmbench3.ps + +parallel.ps: $(ALL) references-parallel parallel.ms + gindxbib references-parallel + groff -G -t -e -s -p -R tmac.usenix parallel.ms > parallel.ps + +install: $(MAN) + for f in $(MAN); do \ + for s in 1 2 3 4 5 6 7 8 9; do \ + if [ ! -d ${MANDIR}/man$${s} ]; then \ + mkdir -p ${MANDIR}/man$${s}; \ + fi; \ + base=`basename $${f} .$${s}`; \ + if [ "$${base}.$${s}" = "$$f" ]; then \ + cp $$f ${MANDIR}/man$${s}/; \ + fi; \ + done; \ + done + +get: $(ALL) + +edit: + get -e -s $(ALL) + +$(MAN): + get -s $(MAN) + +$(PIC): + get -s $(PIC) + +$(DESC): + get -s $(DESC) + +$(USENIX): + get -s $(USENIX) + +clean: + /bin/rm -f *.PS XXX bw.pic memrd_bcopy_comp.pic references.i diff --git a/performance/lmbench3/doc/bargraph.1 b/performance/lmbench3/doc/bargraph.1 new file mode 100644 index 0000000..226caa7 --- /dev/null +++ b/performance/lmbench3/doc/bargraph.1 @@ -0,0 +1,135 @@ +.\" $Id: bargraph.1 1.1 94/11/22 23:04:09-08:00 lm@xxxxxxxxxxxxxxx $ +.TH BARGRAPH 1 +.SH NAME +bargraph \- compile bar graphs into pic input +.SH SYNOPSIS +.B bargraph +[ +.I filename +\&.\|.\|. 
+] +.SH DESCRIPTION +.LP +.B bargraph +is a perl script which +takes a set of Y data with labels and generates a (human readable) pic script +that will produce the bar graph. +The output (pic input) is commented and is designed such that you should be +able to go in and adjust it to fit your document should you need to do so. +.LP +The input data format is: +.sp +.nf +.in +4 +3 foo bar +9 bigger foo +"Silly example +.in +.fi +.sp +with output like +.sp +.nf +.in +2 +.ft CW + bigger + foo + +----------+ + | | + foo | | + bar | | + +----------+ | | + | | | | + +----------+ +----------+ +------------------------------- + 3 9 + + Silly example +.ft +.in +.fi +.SH OPTIONS +The following command line options are available +.TP 10 +-big +Make the x/y defaults be 7.5 inches, crank up the title size, and don't +put a spacer at the top. Used for printing a graph on a full page. +.TP +-nobox +Do not put an outline box around the bargraph. +.SH "CONTROL OPTIONS" +The following may be included in the graph to control the format +of the graph. They must be at the beginning of a line and by themselves. +.TP 18 +%ps <ps> +point size. Default is 10. +.TP +%ft <ft> +font. Default is CB. +.TP +%labelgap <val> +the space in inches between fill labels. The bars may be filled with different +fill values (no patterns yet, pic doesn't do that). If you want to label +these, the labels are labelgap inches apart. Default is 1.5 inches. +.TP +%xsize <val> +the width of the graph in inches. Default is 7 inches. +.TP +%ysize <val> +the height of the graph in inches. Default is 6 inches. +.TP +%Title n|s <title> +the title of the bargraph. The title option is followed by a +a "n"orth (top) or "s"outh (bottom) indicator which controls placement +of the title. No default. +.TP +%titleplus <val> +increases the size of the title in pointsize. Default is 0. +.TP +%boxpercent <val> +a value between 0 and 100 that controls how closely the +bars are to each other. A value of 100 means the bars touch. 
+Default is 75. +.TP +%worse <D> <W> +An idiot arrow is drawn to indicate which way is worse. +<D> is the direction and must be "up" or "down". +<W> is the location specifier and must be one of +"n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. +.TP +%better <D> <W> +An idiot arrow is drawn to indicate which way is better. +<D> is the direction and must be "up" or "down". +<W> is the location specifier and must be one of +"n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. +.TP +%fakemax +pretend that one data point was this big when autoscaling. THis +is used to make a series of bargraphs be all drawn to the same +scale. +.SH "FILL CONTROL" +Each datum may be follwed by a fill specifier as follows +.sp .5 +.ti +.5i +3 foo bar %fill.5 +.sp .5 +Labels may be specified to group a set of data that all have +the same data. If a line appears like +.sp .5 +.ti +.5i +%label.5 The foo bar data +.sp .5 +then you get a label below the graph. +.SH "SEE ALSO" +.BR gtroff (1), +.BR graph (1), +.BR gpic (1) +.SH TODO +Make a -horizontal option that prints the graphs the other way. +.LP +Hack pick to get access to postscripts stipple patterns. +.SH BUGS +This isn't done. It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE thingys. I use it to manually generate +a pic file and then include that. I have to talk to James to +see if he wants it as part of the gpic stuff. 
diff --git a/performance/lmbench3/doc/benchmarks b/performance/lmbench3/doc/benchmarks new file mode 100644 index 0000000..d997811 --- /dev/null +++ b/performance/lmbench3/doc/benchmarks @@ -0,0 +1,68 @@ +Theme + Data movement and the cost thereof + Latency + Time per operation + CPU cycles per operation + Bandwidth + MB / sec + CPU cycles / MB + Media + Memory (load, bcopy) + Disk (randoms, sequentials) + File system (directory ops, sequential) + Network (hot potato, transfer) + Pipes (hot potato, transfer) + VM system (mmaps/munmaps, bcopy) + Systems + All Unix systems + Windows NT + VMS (?) + Mainframes (?) +Memory + Small transfers (randoms) + Load latency + Large transfers (sequential) + Bcopy bandwidth +Processes + Null process execution time + Context switching +Misc + Null entry into the system +Networking + Small transfers (randoms) + Transfers per second + CPU cycles per transfer + socket/bind/close per second + Large transfers (sequential) + MB per second + CPU cycles per MB +Disks + Small transfers (randoms) + Transfers per second + CPU cycles per transfer + Large transfers (sequential) + MB per second + CPU cycles per MB +File system + Small transfers (randoms) + Creates / second + Removes / second + Random I/O's per second in large file + CPU cycles per transfer + MB / sec when reading many related small files + Large files + MB / second read/write + CPU cycles per MB + Hardness + Measure fsck time? 
+Virtual memory system + Creation + mmaps per second + munmaps per second + Also vary size of mapped region + Small transfers (randoms) + Random reads per second of large mmaped file + CPU cycles per read + Large transfers (cached sequential) + MB per second read rate + CPU cycles per MB diff --git a/performance/lmbench3/doc/bw_allmem.tbl b/performance/lmbench3/doc/bw_allmem.tbl new file mode 100644 index 0000000..a0016c0 --- /dev/null +++ b/performance/lmbench3/doc/bw_allmem.tbl @@ -0,0 +1,61 @@ +.KS +.TS +expand doublebox; +c|c s|c s +l|c c|c c +l|r r|r r. + Bcopy Memory +System \fBunrolled\fP libc read write += +DEC Alpha 41 39 76 78\ +DEC Alpha 46 46 88 91\ +DEC Alpha 46 45 79 91\ +DEC Alpha 38 40 69 84\ +SunOS-5.4 sun4d 22 21 47 38\ +DEC Alpha 36 36 55 72\ +DEC Alpha 38 38 64 79\ +SunOS-5.4 sun4m 25 23 64 51\ +SunOS-5.4 sun4m 24 23 59 40\ +SunOS-5.4 sun4d 16 14 36 28\ +SunOS-5.4 sun4m 31 26 80 62\ +Sun SC1000 17 15 38 31\ +Sun Ultra1 85 167 129 152\ +Linux alpha 40 40 74 72\ +Linux i686 42 57 205 56\ +Linux i586 30 31 61 50\ +Linux alpha 39 39 73 71\ +Unixware/i686 65 55 214 86\ +Linux i586 38 42 74 75\ +IBM Power2 242 171 205 364\ +IBM PowerPC 21 21 63 26\ +dgux mc88110 17 17 37 19\ +DEC Alpha 15 15 46 20\ +IRIX64 IP21 68 70 92 90\ +IRIX64-601 IP26 41 32 65 61\ +Linux i586 38 41 74 75\ +Linux i586 20 21 60 31\ +Linux i586 20 21 58 30\ +Linux i586 20 21 60 31\ +Linux i486 16 17 33 41\ +HP-UX 9000/819 55 48 97 89\ +FreeBSD/i586 39 42 73 83\ +FreeBSD/i586 38 41 65 83\ +FreeBSD/i586 38 41 65 83\ +HP-UX 9000/735 32 26 55 52\ +HP-UX 9000/735 32 26 54 51\ +FreeBSD/i586 36 40 62 83\ +IRIX64 IP25 53 41 87 72\ +IRIX64 IP19 32 34 65 67\ +HP-UX 9000/735 31 26 53 51\ +HP-UX 9000/735 32 26 53 51\ +HP-UX 9000/755 31 25 49 52\ +HP-UX 9000/770 31 33 56 61\ +HP-UX 9000/897 19 19 40 37\ +IRIX64 IP19 35 36 65 67\ +IRIX IP19 33 34 67 72\ +IRIX5.3 IP19 32 34 65 68\ +IRIX IP22 32 33 68 72\ +IRIX5.3 IP22 31 32 69 66\ +FreeBSD/i586 39 42 65 83\ +.TE +.KE diff --git 
a/performance/lmbench3/doc/bw_file_rd.8 b/performance/lmbench3/doc/bw_file_rd.8 new file mode 100644 index 0000000..487e6f4 --- /dev/null +++ b/performance/lmbench3/doc/bw_file_rd.8 @@ -0,0 +1,59 @@ +.\" $Id: bw_file_rd.8 1.2 00/10/16 17:13:35+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_FILE_RD 8 "$Date: 00/10/16 17:13:35+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_file_rd \- time the reading and summing of a file +.SH SYNOPSIS +.B bw_file_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B bw_file_rd +times the read of the specified file in 64KB blocks. Each block is summed +up as a series of 4 byte integers in an unrolled loop. +Results are reported in megabytes read per second. +.LP +The data is not accessed in the user program; the benchmark relies on +the operating systems read interface to have actually moved the data. +Systems that implement page flipping may fool this benchmark. +.LP +The benchmark is intended to be used on a file +that is in memory, i.e., the benchmark is a reread benchmark. Other +file benchmarking can be done with +.BR lmdd (8). +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to three times the requested memory. Most Unix +systems implement the read system call as a bcopy from kernel space +to user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destination. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented ``load cache line'' +and ``store cache line'' instructions (as well as ``getcachelinesize''). 
+.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/bw_ipc.tbl b/performance/lmbench3/doc/bw_ipc.tbl new file mode 100644 index 0000000..b106e06 --- /dev/null +++ b/performance/lmbench3/doc/bw_ipc.tbl @@ -0,0 +1,53 @@ +.KS +.TS +expand doublebox; +l c c c +l r r r. +System bcopy \fBpipe\fP TCP += +DEC Alpha 36 32 9\ +DEC Alpha 46 54 11\ +DEC Alpha 38 23 7\ +DEC Alpha 45 35 9\ +DEC Alpha 39 32 12\ +Linux alpha 39 73 9\ +Sun Ultra1 167 61 51\ +SunOS-5.4 sun4m 26 11 11\ +SunOS-5.4 sun4m 23 24 19\ +DEC Alpha 40 24 6\ +DEC Alpha 15 17 4\ +Linux alpha 40 73 9\ +Linux i586 42 34 7\ +Linux i486 17 16 6\ +Linux i586 31 24 3\ +IBM Power2 171 84 10\ +IBM PowerPC 21 30 17\ +SunOS-5.4 sun4d 14 7 8\ +HP-UX 9000/735 26 37 24\ +SunOS-5.4 sun4m 23 7 9\ +Linux i686 57 73 15\ +Linux i586 41 22 5\ +Linux i586 21 19 3\ +Linux i586 21 18 3\ +Linux i586 21 12 3\ +Sun SC1000 15 9 11\ +SunOS-5.4 sun4d 21 8 9\ +IRIX5.3 IP22 32 34 22\ +IRIX64-601 IP26 32 37 22\ +HP-UX 9000/770 33 53 21\ +HP-UX 9000/819 48 37 28\ +HP-UX 9000/755 25 38 35\ +IRIX IP22 33 32 7\ +IRIX64 IP21 70 28 19\ +HP-UX 9000/735 26 44 20\ +HP-UX 9000/735 26 42 18\ +HP-UX 9000/735 26 39 19\ +IRIX64 IP25 41 40 26\ +IRIX64 IP19 34 27 19\ +IRIX64 IP19 36 17 31\ +IRIX IP19 34 14 16\ +IRIX5.3 IP19 34 12 12\ +HP-UX 9000/897 19 26 17\ +dgux mc88110 17 8 5\ +.TE +.KE diff --git a/performance/lmbench3/doc/bw_mem.8 b/performance/lmbench3/doc/bw_mem.8 new file mode 100644 index 0000000..50ed049 --- /dev/null +++ b/performance/lmbench3/doc/bw_mem.8 @@ -0,0 +1,95 @@ +.\" $Id: bw_mem.8 1.4 00/10/16 17:13:36+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_MEM 8 "$Date: 00/10/16 17:13:36+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +bw_mem \- time memory bandwidth +.SH SYNOPSIS 
+.B bw_mem_cp +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I rd|wr|rdwr|cp|fwr|frd|bzero|bcopy +.I [align] +.SH DESCRIPTION +.B bw_mem +allocates twice the specified amount of memory, zeros it, and then times +the copying of the first half to the second half. Results are reported +in megabytes moved per second. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.LP +There are nine different memory benchmarks in +.BR bw_mem . +They each measure slightly different methods for reading, writing or +copying data. +.TP +.B "rd" +measures the time to read data into the processor. It computes the +sum of an array of integer values. It accesses every fourth word. +.TP +.B "wr" +measures the time to write data to memory. It assigns a constant +value to each memory of an array of integer values. +It accesses every fourth word. +.TP +.B "rdwr" +measures the time to read data into memory and then write data to +the same memory location. For each element in an array it adds +the current value to a running sum before assigning a new (constant) +value to the element. +It accesses every fourth word. +.TP +.B "cp" +measures the time to copy data from one location to another. It +does an array copy: dest[i] = source[i]. +It accesses every fourth word. +.TP +.B "frd" +measures the time to read data into the processor. It computes the +sum of an array of integer values. +.TP +.B "fwr" +measures the time to write data to memory. It assigns a constant +value to each memory of an array of integer values. +.TP +.B "fcp" +measures the time to copy data from one location to another. It +does an array copy: dest[i] = source[i]. +.TP +.B "bzero" +measures how fast the system can +.I bzero +memory. 
+.TP +.B "bcopy" +measures how fast the system can +.I bcopy +data. +.SH MEMORY UTILIZATION +This benchmark can move up to three times the requested memory. +Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destionation. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented ``load cache line'' +and ``store cache line'' instructions (as well as ``getcachelinesize''). +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/bw_mem_rd.8 b/performance/lmbench3/doc/bw_mem_rd.8 new file mode 100644 index 0000000..11e5c48 --- /dev/null +++ b/performance/lmbench3/doc/bw_mem_rd.8 @@ -0,0 +1,29 @@ +.\" $Id: bw_mem_rd.8 1.1 94/11/18 01:26:35-08:00 lm@xxxxxxxxxxxxxxx $ +.TH BW_MEM_RD 8 "$Date: 94/11/18 01:26:35-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_mem_rd \- time memory read rate (with overhead) +.SH SYNOPSIS +.B bw_mem_rd +.I size +.SH DESCRIPTION +.B bw_mem_rd +allocates the specified amount of memory, zeros it, and then times the +reading of that memory as a series of integer loads and adds. Each +four byte integer is loaded and added to accumulator. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). 
diff --git a/performance/lmbench3/doc/bw_mmap_rd.8 b/performance/lmbench3/doc/bw_mmap_rd.8 new file mode 100644 index 0000000..1b666b9 --- /dev/null +++ b/performance/lmbench3/doc/bw_mmap_rd.8 @@ -0,0 +1,46 @@ +.\" $Id: bw_mmap_rd.8 1.2 00/10/16 17:13:37+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_MMAP_RD 8 "$Date: 00/10/16 17:13:37+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_mmap_rd \- time the reading and summing of a file +.SH SYNOPSIS +.B bw_mmap_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B bw_mmap_rd +creates a memory mapping to the file and then reads the mapping in an unrolled +loop similar to that used in bw_mem_rd(8). +The benchmark is intended to be used on a file +that is in memory, i.e., the benchmark is a reread benchmark. Other +file benchmarking can be done with +.BR lmdd (8). +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH MEMORY UTILIZATION +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_pipe.8 b/performance/lmbench3/doc/bw_pipe.8 new file mode 100644 index 0000000..ea8fdec --- /dev/null +++ b/performance/lmbench3/doc/bw_pipe.8 @@ -0,0 +1,59 @@ +.\" $Id: bw_pipe.8 1.2 00/10/16 17:13:38+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_PIPE 8 "$Date: 00/10/16 17:13:38+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_pipe \- time data movement through pipes +.SH SYNOPSIS +.B bw_pipe +[ +.I "-m <message size>" +] +[ +.I "-M <total bytes>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B bw_pipe +creates a Unix pipe between two processes and moves +.I "total bytes" +through the pipe in +.I "message size" +chunks (note that pipes are typically sized smaller than that). +The default +.I "total bytes" +is 10MB and the default +.I "message size" +is 64KB. +.SH OUTPUT +Output format is \f(CB"Pipe bandwidth: %0.2f MB/sec\\n", megabytes_per_second\fP, i.e., +.sp +.ft CB +Pipe bandwidth: 4.87 MB/sec +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to six times the requested memory per process. +There are two processes, the sender and the receiver. +Most Unix +systems implement the read/write system calls as a bcopy from/to kernel space +to/from user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destionation. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented "load cache line" +and "store cache line" instructions (as well as getcachelinesize). +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_reread2.tbl b/performance/lmbench3/doc/bw_reread2.tbl new file mode 100644 index 0000000..d1e4347 --- /dev/null +++ b/performance/lmbench3/doc/bw_reread2.tbl @@ -0,0 +1,61 @@ +.KS +.TS +expand doublebox; +c|c c|c c +l|c c|c c +l|r r|r r. + Libc \fBFile\fP Memory File +System bcopy \fBread\fP read mmap += +DEC Alpha 38 37 64 12\ +DEC Alpha 45 40 79 50\ +DEC Alpha 36 36 55 19\ +DEC Alpha 40 44 69 14\ +DEC Alpha 46 48 88 26\ +DEC Alpha 39 39 76 23\ +SunOS-5.4 sun4m 23 31 59 31\ +SunOS-5.4 sun4m 26 23 80 30\ +SunOS-5.4 sun4d 14 23 36 25\ +SunOS-5.4 sun4d 21 23 47 17\ +Sun SC1000 15 20 38 28\ +DEC Alpha 15 20 46 14\ +Sun Ultra1 167 85 129 101\ +Linux alpha 40 25 74 23\ +Linux i586 31 17 61 14\ +SunOS-5.4 sun4m 23 21 64 39\ +Linux alpha 39 24 73 18\ +Unixware/i686 55 53 214 198\ +Linux i586 42 23 74 9\ +IBM Power2 171 187 205 106\ +IBM PowerPC 21 40 63 51\ +Linux i486 17 9 33 10\ +IRIX64 IP21 70 65 92 72\ +Linux i686 57 46 205 34\ +IRIX64-601 IP26 32 75 65 56\ +Linux i586 41 21 74 13\ +Linux i586 21 14 60 11\ +Linux i586 21 14 58 10\ +Linux i586 21 13 60 8\ +HP-UX 9000/735 26 47 55 36\ +HP-UX 9000/819 48 64 97 41\ +HP-UX 9000/755 25 45 49 32\ +FreeBSD/i586 42 38 65 49\ +FreeBSD/i586 42 30 73 54\ +FreeBSD/i586 41 29 65 46\ +IRIX64 IP19 34 34 65 56\ +FreeBSD/i586 40 28 62 47\ +IRIX64 IP25 41 60 87 76\ +HP-UX 9000/735 26 43 53 33\ +HP-UX 9000/735 26 43 54 34\ +HP-UX 9000/735 26 43 53 35\ +HP-UX 9000/770 33 43 56 37\ +HP-UX 9000/897 19 39 40 28\ +FreeBSD/i586 41 29 65 50\ +dgux mc88110 17 16 37 13\ +IRIX5.3 IP22 32 32 69 44\ +IRIX IP19 34 39 67 43\ +IRIX64 IP19 36 36 65 56\ +IRIX5.3 IP19 34 36 65 43\ +IRIX IP22 33 37 68 48\ +.TE +.KE diff --git a/performance/lmbench3/doc/bw_tcp.8 b/performance/lmbench3/doc/bw_tcp.8 new file mode 100644 index 0000000..b60d2fd --- /dev/null +++ b/performance/lmbench3/doc/bw_tcp.8 @@ -0,0 +1,71 @@ +.\" $Id: bw_tcp.8 1.3 00/10/16 17:13:39+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_TCP 1 "$Date: 
00/10/16 17:13:39+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +bw_tcp \- time data movement through TCP/IP sockets +.SH SYNOPSIS +.B bw_tcp +[ +.I "-m <message size>" +] +[ +.I "-M <total bytes>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "server" +.br or +.B bw_tcp +.I -s +.br or +.B bw_tcp +.I "-S <server>" +.SH DESCRIPTION +.B bw_tcp +is a client/server program that moves data over a TCP/IP socket. Nothing is +done with the data on either side; +.I "total bytes" +of data is moved in +.I "message size" +chunks. +.LP +.B bw_tcp +has three forms of usage: as a server (-s), as a client (bw_tcp localhost), and +as a shutdown (bw_tcp -S localhost). +.LP +The default amount of data is 10MB. The client form may specify a different +amount of data. Specifications may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is +.ft CB +Socket bandwidth using localhost: 2.32 MB/sec +.ft +.SH MEMORY UTILIZATION +This benchmark can move up to six times the requested memory per process +when run through the loopback device. +There are two processes, the sender and the receiver. +Most Unix +systems implement the read/write system calls as a bcopy from/to kernel space +to/from user space. Bcopy will use 2-3 times as much memory bandwidth: +there is one read from the source and a write to the destination. The +write usually results in a cache line read and then a write back of +the cache line at some later point. Memory utilization might be reduced +by 1/3 if the processor architecture implemented "load cache line" +and "store cache line" instructions (as well as getcachelinesize). +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation +and Silicon Graphics, Inc. +.SH SEE ALSO +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/bw_tcp.tbl b/performance/lmbench3/doc/bw_tcp.tbl new file mode 100644 index 0000000..6bb1851 --- /dev/null +++ b/performance/lmbench3/doc/bw_tcp.tbl @@ -0,0 +1,57 @@ +.KS +.TS +center expand doublebox; +l r. +Linux alpha 8.9 +Linux i486 5.5 +Linux alpha 8.8 +Linux i586 3.2 +Linux i486 5.6 +Linux i586 2.9 +DEC Alpha 11.2 +Linux i586 3.0 +SunOS-5.4 sun4m 9.5 +SunOS-5.4 sun4m 11.0 +DEC Alpha 4.1 +DEC Alpha 6.6 +DEC Alpha 12.1 +Linux i586 3.0 +SunOS-5.4 sun4d 7.9 +SunOS-5.4 sun4d 9.1 +DEC Alpha 8.6 +DEC Alpha 6.0 +DEC Alpha 10.5 +Sun SC1000 10.9 +Linux i586 5.1 +DEC Alpha 9.2 +Linux i586 6.8 +FreeBSD/i586 0.1 +IRIX IP22 7.2 +Linux i686 14.7 +FreeBSD/i586 0.1 +SunOS-5.4 sun4m 19.5 +FreeBSD/i586 0.1 +Sun Ultra1 51.3 +FreeBSD/i586 0.2 +FreeBSD/i586 0.2 +IBM Power2 10.5 +IBM PowerPC 16.6 +dgux mc88110 4.6 +IRIX64 IP21 18.8 +IRIX IP19 16.4 +HP-UX 9000/735 18.4 +HP-UX 9000/735 19.0 +HP-UX 9000/735 23.9 +HP-UX 9000/897 16.9 +IRIX64-601 IP26 21.5 +IRIX5.3 IP22 22.1 +IRIX5.3 IP19 12.2 +IRIX64 IP19 18.8 +IRIX64 IP25 26.1 +IRIX64 IP19 30.8 +HP-UX 9000/770 20.5 +HP-UX 9000/819 27.7 +HP-UX 9000/755 35.2 +HP-UX 9000/735 19.6 +.TE +.KE diff --git a/performance/lmbench3/doc/bw_unix.8 b/performance/lmbench3/doc/bw_unix.8 new file mode 100644 index 0000000..1940e78 --- /dev/null +++ b/performance/lmbench3/doc/bw_unix.8 @@ -0,0 +1,48 @@ +.\" $Id: bw_unix.8 1.4 00/10/16 17:13:40+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH BW_UNIX 8 "$Date: 00/10/16 17:13:40+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +bw_unix \- UNIX pipe bandwidth +.SH SYNOPSIS +.B bw_unix +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.SH DESCRIPTION +.B bw_unix +creates a pipe and forks a child process which keeps writing +data to the pipe as fast as it can. The benchmark measures +how fast the parent process can +.I read +the data in +.IR size -byte +chunks from the pipe. 
Nothing is done with the data in either +the parent (reader) or child (writer) processes. +.LP +The +.I size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., +.sp +.ft CB +8.00 25.33 +.ft +.SH "MEMORY UTILIZATION" +This benchmark should move approximately the reported amount of memory. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/cache.8 b/performance/lmbench3/doc/cache.8 new file mode 100644 index 0000000..15bdeb8 --- /dev/null +++ b/performance/lmbench3/doc/cache.8 @@ -0,0 +1,49 @@ +.\" $Id$ +.TH CACHE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +cache \- cache parameters +.SH SYNOPSIS +.B cache +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B cache +tries to determine the characteristics of the memory hierarchy. It +attempts to determine the number of caches, the size of each cache, +the line size for each cache, and the available memory parallelism at +each level in the memory hierarchy. +The largest amount of memory it will examine is +.I len +bytes. +.LP +.B cache +first attempts to determine the number and size of caches by measuring +the memory latency for various memory sizes. Once it has identified +the various caches it then measures the latency, parallelism, and line +size for each cache. Unfortunately, determining the cache size merely +from latency is exceedingly difficult due to variations in cache +replacement and prefetching strategies. +.SH BUGS +.B cache +is an experimental benchmark and is known to fail on many processors. 
+In particular there are a large number of machines with weird caching +behavior that confuse +.B cache +and prevent it from accurately determining the number and size of the +various caches. +.SH "SEE ALSO" +lmbench(8), line(8), tlb(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/ctx.pic b/performance/lmbench3/doc/ctx.pic new file mode 100644 index 0000000..8e55781 --- /dev/null +++ b/performance/lmbench3/doc/ctx.pic @@ -0,0 +1,198 @@ +.sp .10i +.in +.07i +.PS +.ps 9 +.vs 9 +.ft CB +[ +# Variables, tweak these. + xtick = 2.000000 # width of an X tick + xlower = 0.000000 # where the xtick start + xupper = 22.000000 # upper range of graph + xn = 11 # number of ticks to do + ytick = 50.000000 # width of an Y tick + ylower = 0.000000 # where the ytick start + yupper = 450.000000 # upper range of graph + yn = 9 # number of ticks to do + xsize = 2.05 # width of the graph + ysize = 2.1 # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = 0.10000000000000001 # distance towards numbers + gthk = .1 # thickness of grid lines + thk = 0.75 # thickness of data lines + qthk = 2.0 # thickness of quartile lines + vs = .10 # works for 10 point fonts + +# Draw the graph borders and tick marks + O: box thick 1.5 ht ysize wid xsize + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) + + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) + } + if (yupper - ylower > 999) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) + } else { 
+ sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) + }}} + j = j + ytick + } + j = xlower + for i = 0 to xn by 1 do { + xs = j - xlower + g = xs * xscale + line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) + + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) + } + if (xupper - xlower > 999) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + (g, -.25) + } else { + sprintf("%.2f", j) at O.sw + (g, -.25) + }}} + j = j + xtick + } + +# DATASET: Process size=0 overhead=10, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (6 - ylower)) +[ "\(ci" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (7 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (7 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=4 overhead=19, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (7 - ylower)) +[ "\(sq" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (8 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (9 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (12 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=16 overhead=66, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (14 - ylower)) +[ 
"\(*D" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (15 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (18 - ylower)) +".12M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (18 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (46 - ylower)) +".25M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (46 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (88 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=32 overhead=129, MARK 3 +[ "\(mu" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (22 - ylower)) +[ "\(mu" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (24 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (107 - ylower)) +".25M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (107 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (187 - ylower)) +".5M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (187 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (188 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# DATASET: Process size=64 overhead=255, MARK 4 +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (2 - xlower), yscale * (38 - ylower)) +".12M" at O.sw + \ + (xscale * (2 - xlower), .12 + yscale * (38 - ylower)) +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (4 - xlower), yscale * (140 - ylower)) +".25M" at O.sw + \ + (xscale * (4 - xlower) - .14, .12 + yscale * (140 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (8 - xlower), yscale * (363 - ylower)) +".5M" at O.sw + \ + (xscale * (8 - xlower), .12 + yscale * (363 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (16 - xlower), yscale * (367 - ylower)) +"1M" at O.sw + \ + (xscale * (16 - xlower), .12 + yscale * (367 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (20 - xlower), yscale * (367 - ylower)) +line thick thk from 2nd last [].c to last [].c + +# Xaxis title. +"\s+1Processes\s0" rjust at O.se - (-.15, .6) + +# Yaxis title (Time in microseconds) +.ps +1 +"T" "i" "m" "e" " " "i" "n" at O.w - (.85, 0) +"m" "i" "c" "r" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.68, 0) +.ps + +# Graph title. +.vs 12 +"\s+2Context switches for" "Linux i686@167Mhz\s0" at O.n + (-.5, .4) +.vs + +# Title. +[ "\(ci" ] at O.sw - (.80, .50 + 0 * vs) +"size=0KB \ overhead=10" ljust at last [].e + (.1, 0) +[ "\(sq" ] at last [] - (0, vs) +"size=4KB \ overhead=19" ljust at last [].e + (.1, 0) +[ "\(*D" ] at last [] - (0, vs) +"size=16KB overhead=66" ljust at last [].e + (.1, 0) +[ "\(mu" ] at last [] - (0, vs) +"size=32KB overhead=129" ljust at last [].e + (.1, 0) +[ "\s+4\(bu\s0" ] at last [] - (0, vs) +"size=64KB overhead=255" ljust at last [].e + (.1, 0) +] +.ft +.ps +.in +.PE diff --git a/performance/lmbench3/doc/ctx.tbl b/performance/lmbench3/doc/ctx.tbl new file mode 100644 index 0000000..b3fdb1a --- /dev/null +++ b/performance/lmbench3/doc/ctx.tbl @@ -0,0 +1,63 @@ +.KS +.TS +expand doublebox; +c|c s|c s +l|c c|c c +l|r r|r r. 
+ 2 processes 8 processes +System \fB0KB\fP 32KB 0KB 32KB += +Linux alpha 10 17 13 41\ +Linux i486 11 394 18 594\ +Linux alpha 11 73 13 92\ +Linux i486 -1 70 -1 78\ +Linux i586 10 163 13 215\ +DEC Alpha 25 18 42 21\ +SunOS-5.4 sun4m 37 128 52 73\ +DEC Alpha 39 55 46 112\ +DEC Alpha 53 50 56 62\ +DEC Alpha 53 66 59 93\ +DEC Alpha 59 68 115 134\ +DEC Alpha 14 27 22 159\ +DEC Alpha 40 42 46 205\ +Sun Ultra1 14 27 20 73\ +Unixware/i686 21 22 \ +DEC Alpha 43 142 45 197\ +SunOS-5.4 sun4m 54 65 85 102\ +SunOS-5.4 sun4m 75 31 110 102\ +IBM Power2 13 16 18 43\ +HP-UX 9000/819 13 41 15 109\ +HP-UX 9000/755 25 29 29 220\ +HP-UX 9000/735 29 39 31 204\ +HP-UX 9000/735 29 42 34 205\ +HP-UX 9000/735 29 32 30 164\ +Linux i586 36 163 47 222\ +Linux i686 6 22 7 107\ +Linux i586 13 178 20 273\ +Linux i586 13 182 21 232\ +Linux i586 16 218 22 266\ +Linux i586 66 240 83 347\ +Sun SC1000 107 135 104 362\ +SunOS-5.4 sun4d 137 245 164 486\ +SunOS-5.4 sun4d 224 113 245 134\ +FreeBSD/i586 28 67 34 158\ +IRIX5.3 IP22 40 47 38 104\ +IBM PowerPC 16 87 26 144\ +FreeBSD/i586 30 54 36 137\ +FreeBSD/i586 24 54 28 137\ +IRIX64 IP21 84 104 87 101\ +dgux mc88110 89 119 122 263\ +HP-UX 9000/897 20 39 23 111\ +HP-UX 9000/735 27 37 30 222\ +FreeBSD/i586 29 41 35 123\ +FreeBSD/i586 29 -13 36 78\ +IRIX IP22 38 50 42 74\ +IRIX64-601 IP26 72 92 74 93\ +IRIX64 IP19 59 68 79 91\ +IRIX64 IP25 55 77 59 85\ +IRIX64 IP19 63 80 69 93\ +IRIX IP19 141 150 96 115\ +HP-UX 9000/770 21 24 21 218\ +IRIX5.3 IP19 150 157 102 167\ +.TE +.KE diff --git a/performance/lmbench3/doc/description.ms b/performance/lmbench3/doc/description.ms new file mode 100644 index 0000000..91d2b23 --- /dev/null +++ b/performance/lmbench3/doc/description.ms @@ -0,0 +1,531 @@ +.\" $X$ xroff -mgs $file +.\" $tty$ groff -mgs $file | colcrt - | more +.\" $lpr$ groff -mgs $file > ${file}.lpr +.\" Define a page top that looks cool +.de PT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. 
nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.de BT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 1994 \\*[author]'\\*(DY'%' +. ps +.. +.\" Configuration +.VARPS +.nr HM 1.0i +.nr FM 1i +.if t .nr PO .75i +.if t .nr LL 7.0i +.if n .nr PO .25i +.if n .nr LL 7.5i +.nr PS 11 +.nr VS \n(PS+2 +.ds title Portable Tools for Performance Analysis +.ds author Larry McVoy +.TL +lmbench: +.sp .5 +\*[title] +.br +\s8Revision $Revision: 1.4 $ of $Date: 94/11/23 18:02:12-08:00 $\s0 +.AU +\*[author] +.AI +.ps -2 +lm@xxxxxxx\** +(415) 390-1804 +.ps +2 +.AB +A description of a set benchmarks for measuring system performance. +The benchmarks include latency measurements of basic system operations +such as memory, processes, networking, and disks, and bandwidth measurements +of memory, disks, and networking. +The benchmarks have been run under a wide variety of Unix systems. +The benchmarks are freely distributed under +the GNU General Public License, with the additional restriction +that results may be reported only if the benchmarks are unmodified. +.AE +.sp 2 +.if t .2C +.FS +This work was mostly done while the author was an employee of Sun Microsystems +Computer Corporation. +.FE +.NH 1 +Introduction +.LP +The purpose of this project is to provide the computer community with tools +for performance analysis of basic operations of their computer systems. +The tools are designed +to be both portable and comparable over a wide set of Unix systems.\** +.FS +The tools have been run on +AIX, +BSDI, +HP-UX, +IRIX, +Linux, +NetBSD, +OSF/1, +Solaris, +and +SunOS by the author. 
+.FE +The interfaces that the tools use have been carefully chosen to be as portable +and standard as possible. It is an explicit intent of the benchmark to measure +standard interfaces. Users of this benchmark may not report results from +modified versions of the benchmarks.\** +.FS +For example, the context switch benchmark may not use a \f(CWyield()\fP +primitive instead of pipes; the networking benchmarks must use the socket +interfaces, not TLI or some other interface. +.FE +.PP +The purpose of +this document is to describe each of the benchmarks. +.PP +The benchmarks are loosely divided into latency, bandwidth, and ``other'' +categories. +.NH 1 +Latency measurements +.LP +The latency measurements included in this suite are process creation times +(including address space extension via mmap()), +basic operating system entry cost, context switching, inter process +communication, file system latency, +disk latency (you must be the super user to get +disk latency results), and memory latency. +.PP +Process benchmarks are used to measure the basic process primitives, +such as creating a new process, running a different program, and context +switching. Process creation benchmarks are of particular interest +to distributed systems since many remote operations include the creation +of a remote process to shepherd the remote operation to completion. +Context switching is important for the same reasons. +.PP +Inter process communication latency is important because many operations +are control messages that tell another process (frequently on another +system) to do something. The latency of telling the remote process to +do something is pure overhead and is frequently in the critical path +of important functions, such as distributed databases.\** +.FS +The performance of the TCP latency benchmark has proven to be a good +estimate of the performance of the Oracle database lock manager. 
+.FE +.PP +The inter process communication latency benchmarks are roughly the same +idea: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds it takes +to do one round trip. If you are interested in a one way timing, then +about half the round trip is right (however, the CPU cycles tend to be +somewhat asymmetric for a one trip). +.NH 2 +Process forks/exits +.LP +Create a child process which does nothing but +terminate. Results are reported in creations per second. +The benchmark is measuring how fast the OS can create a new address +space and process context. +The child process is spawned via the \f(CBfork\fP() interface, +not the \f(CBvfork\fP() interface. +.NH 2 +Simple process creates I +.LP +Create a child process which then runs a new program that does nothing +but print ``hello world'' and exit. The difference between this +benchmark and the previous is the running of a new program. The +time difference between this and the previous benchmark is the cost +of starting a new (simple) program. That cost is especially noticeable +on (some) systems that have shared libraries. Shared libraries can +introduce a substantial (10s of milliseconds) start up cost. This +benchmark is intended to quantify the time/space tradeoff of shared +libraries. +.NH 2 +Simple process creates II +.LP +Create a child process which runs the same new program except that the +program is started by the system shell. This is a clone of the C +library \f(CBsystem\fP() interface. The intent is to educate users +about the cost of this interface. I have long felt that using the +Bourne shell, especially a dynamically linked Bourne shell, to start up +processes is over kill; perhaps these numbers will convince others of the +same thing. A better choice would be Plan 9's \f(CBrc\fP shell (which +is, by the way, free software). 
+.NH 2 +Memory mapping +.LP +Memory mapping is the process of making a file part of a process' address +space, allowing direct access to the file's pages. It is an alternative +to the traditional read and write interfaces. Memory mapping is extensively +used for linking in shared libraries at run time. This benchmark measures +the speed at which mappings can be created as well as removed. Results +are reported in mappings per second, and the results can be graphed as the +test is run over a series of different sizes. +.NH 2 +Context switches +.LP +Measures process context switch time.\** A context switch is defined as +the time it takes to save the state of one process and restore the state +of another process. +Typical context switch benchmarks measure just the minimal context switch +time, i.e., the time to switch between two processes that are doing nothing +but context switching. That approach is misleading because systems may +have multiple active processes and the processes typically have more state +(hot cache lines) than just the code required to force another context +switch. This benchmark takes that into consideration and varies both +the number and the size of the processes. +.FS +A previous version of this benchmark included several system calls +in addition to the context switch, resulting in grossly over inflated +context switch times. +.FE +.PP +The benchmark is a ring of two to twenty processes that are connected +with Unix pipes. A token is passed from process to process, forcing +context switches. The benchmark measures the time it takes to pass +the token two thousand times from process to process. Each hand off +of the token has two costs: (a) the context switch, and (b) the cost +of passing the token. In order to get just the context switching time, +the benchmark first measures the cost of passing the token through a +ring of pipes in a single process. 
This time is defined as the cost +of passing the token and is not included in the reported context switch +time. +.PP +When the processes are larger than the default baseline of ``zero'' +(where zero means just big enough to do the benchmark), the cost +of the context switch includes the cost of restoring user level +state (cache lines). This is accomplished by having the process +allocate an array of data and sum it as a series of integers +after receiving the token but before passing the token to the +next process. Note that the overhead mentioned above includes +the cost of accessing the data but because it is measured in +just one address space, the cost is typically the cost with hot +caches. So the context switch time does not include anything +other than the context switch provided that all the processes +fit in the cache. If there are cache misses (as is common), the +cost of the context switch includes the cost of those cache misses. +.PP +Results for an HP system running at 100 mhz are shown below. +This is a particularly nice system for this benchmark because the +results are quite close to what is expected from a machine with a +256KB cache. As the size and number of processes are both increased, +processes start falling out of the cache, resulting in higher context +switch times. +.LP +.so ctx.pic +.NH 2 +Null system calls +.LP +Measures the cost of entering and exiting (without pausing) the +operating system. This is accomplished by repeatedly writing one byte +to \f(CB/dev/null\fP, a pseudo device driver that does nothing but +discard the data. Results are reported as system calls per second. +.PP +It is important to note that the system call chosen actually does the +work on all systems, to the best of my knowledge. There are some +systems that optimized trivial system calls, such as \f(CBgetpid\fP(), +to return the answer without a true entry into the OS proper. Writing +to \f(CB/dev/null\fP has not been optimized. 
+.NH 2 +Pipe latency +.LP +This benchmark measures the OS; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through a pair of +Unix pipes. +.NH 2 +TCP/IP latency +.LP +This benchmark measures the OS +networking code and the driver code; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through an AF_INET +socket. Note that both remote and local results may be reported. +.NH 2 +UDP/IP latency +.LP +This benchmark measures the OS +networking code and the driver code; there is almost no code executed at +user level. The benchmark measures the round trip time of a small message +being passed back and forth between two processes through an AF_INET socket. +Note that both remote +and local results may be reported. +.LP +It is interesting to note that the TCP performance is sometimes +greater than the UDP performance. +This is contrary to expectations since +the TCP protocol is a reliable, connection oriented protocol, and as such +is expected to carry more overhead. +Why this is so is an exercise left to the +reader. +.NH 2 +RPC latency (TCP and UDP) +.LP +Actually two latency benchmarks: Sun RPC over TCP/IP and over UDP/IP. +This benchmark consists of the user level RPC code layered over the TCP +or UDP sockets. The benchmark measures the round trip time of a small +message being passed back and forth between two processes. Note that +both remote and local results may be reported. +.LP +Using the TCP or the UDP benchmarks as a baseline, it +is possible to see how much the RPC code is costing. +.NH 2 +TCP/IP connect latency +.LP +This benchmark measures the time it takes to get a TCP/IP socket and +connect it to a remote server. 
+.NH 2 +File system latency +.LP +A benchmark that measures how fast the file system can do basic, common +operations, such as creates and deletes of small files. +.NH 2 +Page fault latency +.LP +A benchmark that measures how fast the file system can pagefault in a +page that is not in memory. +.NH 2 +Disk latency +.LP +A benchmark that is designed to measure the overhead of a disk +operation. Results are reported as operations per second. +.PP +The benchmark is designed with SCSI disks in mind. It actually simulates +a large number of disks in the following way. The benchmark reads 512 byte +chunks sequentially from the raw disk device (raw disks are unbuffered +and are not read ahead by Unix). The benchmark ``knows'' that most +disks have read ahead buffers that read ahead the next 32-128 kilobytes. +Furthermore, the benchmark ``knows'' that the disks rotate and read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true - a processor could be fast enough to make the +requests faster than the rotating disk. If we take 3MB/sec to be disk +speed, a fair speed, and divide that by 512, that is 6144 IOs/second, or +163 microseconds per IO. I don't know of any processor/OS/io controller +combinations that can do an +IO in 163 microseconds. +.FE +So the benchmark is basically reading small chunks of data from the +disks track buffer. Another way to look at this is that the benchmark +is doing memory to memory transfers across a SCSI channel. +.PP +No matter how you look at it, the resulting number represents a +\fBlower\fP bound on the overhead of a disk I/O. In point of fact, +the real numbers will be higher on SCSI systems. Most SCSI controllers +will not disconnect if the request can be satisfied immediately; that is +the case here. In practice, the overhead numbers will be higher because +the processor will send the request, disconnect, get interrupted, +reconnect, and transfer. 
+.PP +It is possible to generate loads of upwards of 500 IOPs on a single +SCSI disk using this technique. It is useful to do that to figure out +how many drives could be supported on a system before there are no +more processor cycles to handle the load. Using this trick, you +do not have to hook up 30 drives, you simulate them. +.NH 2 +Memory read latency +.LP +This is perhaps the most interesting benchmark in the suite. The +entire memory hierarchy is measured, including onboard cache latency +and size, external cache latency and size, main memory latency, and TLB +miss latency. +.PP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. Then the list is walked like so +.DS +.ft CB +mov r0,(r0) # C code: p = *p; +.DE +The time to do about fifty thousand loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction (not counting stalls) in one +processor cycle. In other words, if the processor cache load time +is 60 nanoseconds on a 20 nanosecond processor, the load latency reported +would be 40 nanoseconds, the missing 20 nanoseconds is for the load instruction +itself. Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (I don't think any processors can do that). +.PP +Note that this benchmark has been validated by logic analyzer measurements +on an SGI indy. The +clever reader might realize that the last few nanoseconds of inaccuracy could be +rounded off by realizing that the latency is always going to be a multiple +of the processor clock rate. 
+.PP +The raw data is a series of data sets. Each data set is a stride size, +with array size varied from about one kilobyte up to eight megabytes. +When these data sets are all plotted together (using a log base 2 scale +for the size variable), the data will be seen to contain a series of +horizontal plateaus. The first is the onboard data cache latency (if there +is an onboard cache). The point where the lines start to go up marks the +size of the cache. The second is the external cache, the third is the +main memory, and the last is main memory plus TLB miss cost. In addition +to this information, the cache line size can be derived by noticing which +strides are faster than main memory times. The first stride that is +main memory speed is likely to be the cache line size. The reason is +that the strides that are faster than memory indicate that the benchmark is +getting more than one hit per cache line. Note that prefetching may confuse +you. +.PP +The graph below shows a particularly nicely made machine, a DEC alpha. +This machine is nice because (a) it shows the latencies and sizes of +the on chip level 1 and motherboard level 2 caches, and (b) because it +has the best all around numbers, especially considering it can support a +4MB level 2 cache. Nice work, DEC. +.so mem.pic +.NH 1 +Bandwidth measurements +.LP +One of my former managers\** once noted that ``Unix is Swahili for bcopy().'' +I believe that he was indicating his belief that the operating system spent +most of its time moving data from one place to another, via various means. +I tend to agree and have measured the various ways that data can be moved. +The ways that are measured are: through pipes, TCP sockets, library bcopy() +and hand unrolled bcopy(), the read() interface, through the mmap() interface, +and direct memory read and write (no copying). +.FS +Ken Okin +.FE +.NH 2 +Pipe bandwidth +.LP +Bandwidth measurement between two local processes communicating through +a Unix pipe. 
Results are in megabytes per second. +.NH 2 +TCP/IP socket bandwidth +.LP +Bandwidth measurement using TCP/IP sockets. Results are reported in megabytes +per second. +Results are reported for local, ethernet, FDDI, and ATM, where possible. +Results range from 1-10+ megabytes per second. Any system delivering +more than 10 MB/second over TCP is doing very well by 1994 standards. +.PP +Note that for local measurements, the system is actually moving +twice as much data, since the data is being moved to/from the same host. +.PP +Local bandwidths are (sometimes) useful for determining the overhead of the +protocol stack (as well as other OS tasks, such as context switching). +Note, however, that some implementations (such as Solaris 2.x) have +``fast pathed'' loopback IP which skews the results. The fast path +uses a larger MTU and does not do checksums. +.PP +The sockets are configured to use the largest receive/send buffers that the OS +will allow. This is done to allow maximum bandwidth. Sun's 4.x TCP/IP +subsystem (and probably BSD's as well) default to 4KB send/receive buffers, +which is too small. (It would be better if the OS noted that this was a +high volume / high bandwidth connection and automatically grew the buffers. +Hint, hint.) +.NH 2 +bcopy bandwidths +.LP +A simple benchmark that measures how fast data can be copied. A hand +unrolled version and the C library version are tested. Results are +reported in megabytes per second. Note that a typical system is actually +moving about three times as much memory as the reported result. A copy +is actually a read, a write which causes a cache line read, and a write +back. +.NH 2 +Read bandwidth +.LP +Most VM system cache file pages for reuse. This benchmark measures the +speed at which those pages can be reused. It is important to notice +that this is not a disk read measurement, it is a memory read measurement. +Results are reported in megabytes per second. 
+.NH 2 +Mmap read bandwidth +.LP +The same measurement as the previous benchmark except that it maps the +file, avoiding the copy from kernel to user buffer. +Results are reported in megabytes per second. +.NH 2 +Memory read bandwidth +.LP +A large array is repeatedly read sequentially. +Results reported in megabytes per second. +.NH 2 +Memory write bandwidth +.LP +A large array is repeatedly written sequentially. +Results reported in megabytes per second. +.NH 1 +Other measurements +.LP +.NH 2 +Processor cycle time +mhz +.LP +Calculates the megahertz and clock speed of the processor. This is the +standard loop in which a series of interlocked operations are timed, +and then the megahertz is derived from the timing. The operations +are purposefully interlocked to overcome any superscalarness of the +system under test. +.PP +There are actually three versions of mhz, a generic one that works on +most systems, and two specific versions for SuperSPARC and rs6000 +systems. +.PP +It turns out that the +SuperSPARC processor has two ALU's that are run at twice the clock rate, +allowing two interlocked operations to complete in one processor clock.\** +.FS +Credit and thanks to John Mashey of SGI/MIPS fame, who kindly took the +time to figure out why the benchmark wasn't working on SuperSPARC +systems. He explained the SuperSPARC pipeline and the solution to the +problem. +.FE +Fortunately, the ALU's are asymmetric and can not do two shifts in +one processor clock. Shifts are used on SuperSPARC systems. +.PP +IBM rs6000 systems have a C compiler that does not honor the +``register'' directive in unoptimized code. The IBM loop looks +like it is doing half as many instructions as the others. This +is on purpose, each add on the IBM is actually two instructions +(I think it is a load/add/store or something like that). +.NH 1 +Acknowledgments +.LP +I would like to acknowledge Sun Microsystems for supporting the development +of this project. 
In particular, my personal thanks to Paul Borrill, +Director of the Architecture and Performance group, for conceiving and +supporting the development of these benchmarks. +.PP +My thanks to John Mashey and Neal Nuckolls of Silicon Graphics for reviews, +comments, and explanations of the more obscure problems. +.PP +My thanks to Satya Nishtala of Sun Microsystems for (a) listening to me +complain about memory latencies over and over, (b) doing something about +it in future SPARC systems, and (c) reviewing the memory latency results +and explained IBM's sub blocking scheme (I still don't really understand +it but he does. Ask him). +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks will be posted to the Usenet comp.benchmarks group. In addition, +mail sent to \f(CBarchives@xxxxxxxxxxxxxxxxxxx\fP with a request for +\f(CBlmbench.shar\fP +sources will get the latest and greatest. diff --git a/performance/lmbench3/doc/graph.1 b/performance/lmbench3/doc/graph.1 new file mode 100644 index 0000000..64a5cb3 --- /dev/null +++ b/performance/lmbench3/doc/graph.1 @@ -0,0 +1,143 @@ +.\" $Id: graph.1 1.2 94/12/27 17:50:18-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH GRAPH 1 +.SH NAME +graph \- compile graphs into pic input +.SH SYNOPSIS +.B graph +[ options ] +[ +.I filename +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B graph +is a perl script which +takes sets of X Y data and generates a (human readable) pic program +that will produce the graphed data. The output is designed such that +you can save it in a file and tweak it to make it fit your document. +Try one and look at the output. The output is actually commented. +.LP +The graph is autosized and auto ticked. 
+.LP +The input data format is similar to +that of xgraph(1), i.e., +.DS +1 1 +2 2 +3 3 +"sloped across + +1 4 +2 4 +3 4 +"straight across +.DE +.SH "CONTROL OPTIONS" +.LP +You may set the graph title, the X title, and the Y title with the +following control sequences in the data stream: +.DS +%T Graph title in +4 point font +%X X axis title and/or units in +2 point font +%Y Y axis title and/or units in +2 point font +%fakemax-X <value> force graph to be that big +%fakemax-Y <value> force graph to be that big +%fakemin-X <value> force graph to be that small +%fakemin-Y <value> force graph to be that small +.DE +.SH OPTIONS +.IP -rev 12 +reverse X/Y data sense (and titles). Note this is done after processing +any fudging of the input data stream(s) (see -xk, -yk, -logx, etc below). +.IP -below +put data set titles below the graph rather than to the right. +.IP -close +no extra space around the data's endpoints. +.IP -qline +connect the quartile center points. +.IP -grid +dotted line grid marks. +.IP -nobox +no box around whole graph. +.IP -big +make the graph take the whole page. +.IP -medium +make the graph take about 1/2 the page. +.IP -small +make the graph be small. +.IP -grapheach +draw each data set in its own graph. +.IP -nolabels +no X/Y/Title labels. +.IP -nodatal +no data set labels. +.IP -nomarks +do not mark each data point with distinct markers (endpoints are still +marked). +.IP -k +print values larger than 1000 as value/1000. +.IP -xk +multiply X input by 1024 (blech). +.IP -yk +multiply Y input by 1024 (blech). +.IP -xm +multiply X input by 1024*1024 (blech). +.IP -ym +multiply Y input by 1024*1024 (blech). +.IP -logx +convert X input into log base 2 of X input. +.IP -logy +convert Y input into log base 2 of Y input. +.SH EXAMPLE +Workstation price performance from a Digital ad. 
Process with +.DS +.ps -2 +graph -rev workstations | groff -TX75 + +"%T Workstation Price / Performance, 6/93 +"%X SPECINT 92 Performance +"%Y Price in $1000's +35 5 +65 10 +78 15 +110 70 +"Dec AXP line + +25 4 +25 8 +38 16 +48 21 +52 23 +64 27 +"Sun SPARC line +.DE +.ps +.SH "QUARTILE FORMAT" +Data points are \f(CBx y1 y2 y3 y4 y5\fP. You get a two lines from the +first two y values, a mark at the third, and another line from the last two. +.SH "SEE ALSO" +.BR gtroff (1), +.BR gpic (1), +.BR perl (1). +.SH BUGS +This should probably be called pic_graph or something like that. +.LP +This isn't done as much as I would like. +It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE things. I use it to manually generate +a pic file and then include that. diff --git a/performance/lmbench3/doc/lat_allmem.tbl b/performance/lmbench3/doc/lat_allmem.tbl new file mode 100644 index 0000000..8594cb9 --- /dev/null +++ b/performance/lmbench3/doc/lat_allmem.tbl @@ -0,0 +1,62 @@ +.KS +.TS +expand doublebox; +l c c c +l c c c +l r r r. 
+ Level 1 Level 2 Main +System cache cache memory += +Linux i586 8 103 151\ +DEC Alpha 12 67 291\ +Linux i586 8 107 150\ +DEC Alpha 10 56 321\ +Unixware/i686 14 34 196\ +DEC Alpha 9 51 288\ +DEC Alpha 7 47 458\ +DEC Alpha 12 57 468\ +SunOS-5.4 sun4m 13 -- 180\ +SunOS-5.4 sun4m 20 -- 291\ +SunOS-5.4 sun4m 16 115 816\ +Sun Ultra1 6 42 270\ +SunOS-5.4 sun4d 16 116 995\ +IBM Power2 -- 13 141\ +IBM PowerPC 6 164 394\ +DEC Alpha 10 53 477\ +FreeBSD/i586 10 115 179\ +FreeBSD/i586 7 111 181\ +DEC Alpha 13 104 957\ +FreeBSD/i586 10 118 180\ +FreeBSD/i586 10 101 180\ +HP-UX 9000/735 -- 10 347\ +Sun SC1000 20 140 1236\ +HP-UX 9000/770 -- 9 376\ +SunOS-5.4 sun4d 24 173 1246\ +Linux i686 12 90 194\ +Linux i586 10 190 320\ +Linux i586 10 148 320\ +Linux i586 10 198 321\ +Linux i586 10 222 321\ +Linux i486 12 234 336\ +Linux alpha 3 83 354\ +Linux alpha 3 43 361\ +DEC Alpha 3 42 396\ +HP-UX 9000/735 -- 10 348\ +IRIX5.3 IP22 10 76 1018\ +IRIX64 IP25 8 58 1134\ +HP-UX 9000/735 -- 10 347\ +HP-UX 9000/897 -- 11 424\ +HP-UX 9000/819 -- 10 430\ +IRIX64 IP21 11 100 709\ +IRIX64 IP19 10 75 1150\ +IRIX IP19 8 64 1189\ +IRIX5.3 IP19 10 75 1149\ +IRIX64 IP19 10 70 1152\ +IRIX IP22 8 64 1170\ +FreeBSD/i586 10 106 181\ +HP-UX 9000/735 -- 10 348\ +HP-UX 9000/755 -- 10 393\ +dgux mc88110 22 319 753\ +IRIX64-601 IP26 13 120 1244\ +.TE +.KE diff --git a/performance/lmbench3/doc/lat_allproc.tbl b/performance/lmbench3/doc/lat_allproc.tbl new file mode 100644 index 0000000..d1aee27 --- /dev/null +++ b/performance/lmbench3/doc/lat_allproc.tbl @@ -0,0 +1,60 @@ +.KS +.TS +expand doublebox; +l|c|c|c +l|r|r|r. 
+ fork \fBfork, exec\fP fork, exec +System & exit \fB& exit\fP sh -c & exit += +DEC Alpha 4.6 13\ 42\ +DEC Alpha 3.3 11\ 44\ +Linux alpha 0.7 3\ 12\ +Linux alpha 1.0 2\ 16\ +DEC Alpha 2.0 6\ 43\ +DEC Alpha 4.8 16\ 64\ +Linux i686 0.5 5\ 17\ +DEC Alpha 3.1 10\ 281\ +Linux i586 0.9 5\ 16\ +DEC Alpha 5.3 14\ 27\ +DEC Alpha 5.1 15\ 89\ +Sun Ultra1 3.7 20\ 10\ +SunOS-5.4 sun4m 8.0 46\ 237\ +SunOS-5.4 sun4m 18.0 83\ 37\ +SunOS-5.4 sun4m 10.7 57\ 87\ +Linux i486 3.3 10\ 112\ +Linux i586 1.6 12\ 44\ +SunOS-5.4 sun4d 13.7 75\ 113\ +IBM Power2 1.2 8\ 16\ +IBM PowerPC 2.9 8\ 50\ +SunOS-5.4 sun4d 20.8 93\ 136\ +HP-UX 9000/735 1.3 3\ 17\ +IRIX5.3 IP19 4.3 8\ 20\ +IRIX5.3 IP22 3.1 8\ 19\ +IRIX64-601 IP26 4.6 24\ 39\ +IRIX IP22 3.0 8\ 22\ +Linux i586 2.4 9\ 26\ +Linux i586 1.8 15\ 30\ +Linux i586 1.9 15\ 30\ +Linux i586 3.1 24\ 73\ +DEC Alpha 13.4 33\ 39\ +Sun SC1000 14.0 69\ 175\ +FreeBSD/i586 2.9 14\ 22\ +FreeBSD/i586 2.7 13\ 21\ +IRIX64 IP21 4.2 14\ 30\ +HP-UX 9000/770 3.1 9\ 18\ +FreeBSD/i586 2.8 13\ 22\ +HP-UX 9000/735 3.5 10\ 20\ +HP-UX 9000/735 3.5 10\ 19\ +IRIX64 IP19 4.5 19\ 37\ +HP-UX 9000/819 4.2 67\ 118\ +HP-UX 9000/755 3.6 10\ 18\ +HP-UX 9000/897 6.7 15\ 37\ +IRIX IP19 6.2 19\ 46\ +HP-UX 9000/735 3.5 10\ 20\ +FreeBSD/i586 2.7 12\ 20\ +FreeBSD/i586 3.0 14\ 23\ +IRIX64 IP25 3.3 12\ 24\ +IRIX64 IP19 4.0 14\ 24\ +dgux mc88110 8.8 13\ 67\ +.TE +.KE diff --git a/performance/lmbench3/doc/lat_connect.8 b/performance/lmbench3/doc/lat_connect.8 new file mode 100644 index 0000000..11a7912 --- /dev/null +++ b/performance/lmbench3/doc/lat_connect.8 @@ -0,0 +1,47 @@ +.\" $Id: lat_connect.8 1.2 00/10/16 17:13:41+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_CONNECT 8 "$Date: 00/10/16 17:13:41+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_connect \- measure interprocess connection latency via TCP/IP +.SH SYNOPSIS +.B lat_connect +.I -s +.sp .5 +.B lat_connect +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B 
lat_connect +.I "-S hostname" +.SH DESCRIPTION +.B lat_connect +is a client/server program that measures interprocess +connection latencies. The benchmark times the creation and connection of +an AF_INET (aka TCP/IP) socket to a remote server. Care is taken that the +connection time does not include any other overhead, such as the +\fIgethostbyname()\fP or remote port lookups since these add more overhead +than the connection establishment itself. +.LP +.B lat_connect +has three forms of usage: as a server (-s), as a client (lat_connect localhost), +and as a shutdown (lat_connect -S localhost). +.SH OUTPUT +The reported time is in microseconds per connection. +Output format is like so +.sp +.ft CB +TCP/IP connection cost to localhost: 1006 microseconds +.ft +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_connect.tbl b/performance/lmbench3/doc/lat_connect.tbl new file mode 100644 index 0000000..5e6b4c2 --- /dev/null +++ b/performance/lmbench3/doc/lat_connect.tbl @@ -0,0 +1,44 @@ +.KS +.TS +center expand doublebox; +l r. 
+DEC Alpha 976 +Linux i586 606 +IRIX IP22 470 +SunOS-5.4 sun4d 852 +SunOS-5.4 sun4d 3123 +Sun SC1000 4594 +IRIX64-601 IP26 316 +Linux i586 1155 +IRIX5.3 IP22 349 +IRIX64 IP21 667 +IBM Power2 339 +dgux mc88110 4635 +DEC Alpha 4700 +HP-UX 9000/770 319 +HP-UX 9000/755 384 +HP-UX 9000/735 389 +IRIX64 IP25 716 +IRIX64 IP19 763 +IRIX5.3 IP19 694 +Linux i686 746 +Linux i586 775 +Linux i586 779 +Linux i586 835 +Linux i586 1348 +Linux i486 1439 +DEC Alpha 3047 +FreeBSD/i586 454 +HP-UX 9000/897 765 +FreeBSD/i586 465 +FreeBSD/i586 454 +FreeBSD/i586 397 +IRIX IP19 697 +HP-UX 9000/735 388 +IRIX64 IP19 805 +HP-UX 9000/735 459 +HP-UX 9000/819 585 +HP-UX 9000/735 740 +FreeBSD/i586 481 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_ctx.8 b/performance/lmbench3/doc/lat_ctx.8 new file mode 100644 index 0000000..f5a1c2b --- /dev/null +++ b/performance/lmbench3/doc/lat_ctx.8 @@ -0,0 +1,95 @@ +.\" $Id: lat_ctx.8 1.2 00/10/16 17:13:42+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_CTX 8 "$Date: 00/10/16 17:13:42+02:00 $" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_ctx \- context switching benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "-s <size_in_kbytes>" +] +.I "#procs" +[ +.I "#procs ..." +] +.SH DESCRIPTION +.B lat_ctx +measures context switching time for any reasonable +number of processes of any reasonable size. +The processes are connected in a ring of Unix pipes. Each process +reads a token from its pipe, possibly does some work, and then writes +the token to the next process. +.LP +Processes may vary in number. Smaller numbers of processes result in +faster context switches. More than 20 processes is not supported. +.LP +Processes may vary in size. A size of zero is the baseline process that +does nothing except pass the token on to the next process. A process size +of greater than zero means that the process does some work before passing +on the token. 
The work is simulated as the summing up of an array of the +specified size. The summing is an unrolled loop of about a 2.7 thousand +instructions. +.LP +The effect is that both the data and the instruction cache +get polluted by some amount before the token is passed on. The data +cache gets polluted by approximately the process ``size''. The instruction +cache gets polluted by a constant amount, approximately 2.7 +thousand instructions. +.LP +The pollution of the caches results in larger context switching times for +the larger processes. This may be confusing because the benchmark takes +pains to measure only the context switch time, not including the overhead +of doing the work. The subtle point is that the overhead is measured using +hot caches. As the number and size of the processes increases, the caches +are more and more polluted until the set of processes do not fit. The +context switch times go up because a context switch is defined as the switch +time +plus the time it takes to restore all of the process state, including +cache state. This means that the switch includes the time for the cache +misses on larger processes. +.SH OUTPUT +Output format is intended as input to \fBxgraph\fP or some similar program. +The format is multi line, the first line is a title that specifies the +size and non-context switching overhead of the test. Each subsequent +line is a pair of numbers that indicates the number of processes and +the cost of a context switch. The overhead and the context switch times are +in micro second units. The numbers below are for a SPARCstation 2. +.sp +.ft CB +.nf +"size=0 ovr=179 +2 71 +4 104 +8 134 +16 333 +20 438 +.br +.fi +.ft +.SH BUGS +The numbers produced by this benchmark are somewhat inaccurate; they vary +by about 10 to 15% from run to run. A series of runs may be done and the +lowest numbers reported. The lower the number the more accurate the results. 
+.LP +The reasons for the inaccuracies are possibly interaction between the +VM system and the processor caches. It is possible that sometimes the +benchmark processes are laid out in memory such that there are fewer +TLB/cache conflicts than other times. This is pure speculation on our part. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_disk.tbl b/performance/lmbench3/doc/lat_disk.tbl new file mode 100644 index 0000000..a39a6aa --- /dev/null +++ b/performance/lmbench3/doc/lat_disk.tbl @@ -0,0 +1,23 @@ +.KS +.TS +center expand doublebox; +l r. +SunOS-5.4 sun4m 2876 +Sun SC1000 1466 +DEC Alpha 1436 +DEC Alpha 1995 +IRIX IP22 984 +Sun Ultra1 2242 +HP-UX 9000/770 732 +IRIX IP19 920 +IRIX5.3 IP22 1265 +IRIX5.3 IP19 991 +DEC Alpha 2057 +DEC Alpha 3729 +FreeBSD/i586 297 +FreeBSD/i586 306 +FreeBSD/i586 2314 +FreeBSD/i586 2284 +FreeBSD/i586 310 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_fcntl.8 b/performance/lmbench3/doc/lat_fcntl.8 new file mode 100644 index 0000000..cf3c93e --- /dev/null +++ b/performance/lmbench3/doc/lat_fcntl.8 @@ -0,0 +1,32 @@ +.\" $Id$ +.TH LAT_FCNTL 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_fcntl \- fcntl file locking benchmark +.SH SYNOPSIS +.B lat_fcntl +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_fcntl +is a client/server program that measures file locking latencies. The +benchmark alternately locks and unlocks files so that only one of the +client or server is running at a time, similar to ``hot potato'' +message passing benchmarks. +No other work is done in the processes. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. 
+.SH "SEE ALSO" +lmbench(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_fifo.8 b/performance/lmbench3/doc/lat_fifo.8 new file mode 100644 index 0000000..65e5a08 --- /dev/null +++ b/performance/lmbench3/doc/lat_fifo.8 @@ -0,0 +1,32 @@ +.\" $Id$ +.TH LAT_FIFO 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_fifo \- FIFO benchmark +.SH SYNOPSIS +.B lat_fifo +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_fifo +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +The message is passed back and forth using FIFOs. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_fs.8 b/performance/lmbench3/doc/lat_fs.8 new file mode 100644 index 0000000..51afc83 --- /dev/null +++ b/performance/lmbench3/doc/lat_fs.8 @@ -0,0 +1,37 @@ +.\" $Id: lat_fs.8 1.4 94/11/25 16:33:19-08:00 lm@xxxxxxxxxxxxxxx $ +.TH LAT_FS 8 "$Date: 94/11/25 16:33:19-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_fs \- measure file system create/delete performance +.SH SYNOPSIS +.B lat_fs +[ +.I dir +] +.SH DESCRIPTION +.B lat_fs +is a program that creates a number of small files in the current working +directory and then removes the files. Both the creation and removal of +the files is timed. 
+.SH OPTIONS +If +.I dir +is specified, +.B lat_fs +will change to that directory first and do the creates and deletes there. +Otherwise the creates and deletes are done in $PWD. +.SH OUTPUT +The results are in terms of creates per second and deletes per second +as a function of file size. The output is in 4 column form and is the +size of the file, the number created, the creations per second, and the +removals per second. Output format looks like: +.sp +.ft CB +.nf +0k 500 1304 2740 +1k 500 904 1663 +4k 500 861 1647 +10k 500 674 1516 +.fi +.ft +.SH "SEE ALSO" +lmbench(8). diff --git a/performance/lmbench3/doc/lat_fs.tbl b/performance/lmbench3/doc/lat_fs.tbl new file mode 100644 index 0000000..b73a9d7 --- /dev/null +++ b/performance/lmbench3/doc/lat_fs.tbl @@ -0,0 +1,56 @@ +.KS +.TS +expand doublebox; +l c c +l r r. +System Create \fBDelete\fP += +Linux i586 1.4 0.1 +IRIX64-601 IP26 0.9 0.1 +Linux i586 1.5 0.1 +Linux i586 1.1 0.1 +Linux i586 1.4 0.1 +Linux i686 1.2 0.1 +SunOS-5.4 sun4d 0.7 0.4 +SunOS-5.4 sun4d 18.2 8.3 +Linux i586 1.4 0.1 +Linux i486 0.8 0.1 +Linux i486 0.8 0.1 +Linux i586 2.7 0.2 +Sun SC1000 3.7 1.3 +Linux alpha 4.3 4.2 +DEC Alpha 25.0 11.4 +DEC Alpha 25.0 11.1 +DEC Alpha 0.8 0.3 +DEC Alpha 1.3 0.5 +DEC Alpha 38.5 12.3 +DEC Alpha 33.3 11.9 +DEC Alpha 23.3 11.5 +IRIX64 IP25 3.5 4.0 +IRIX64 IP19 3.1 5.0 +IRIX IP22 13.3 8.4 +Linux alpha 25.0 11.5 +DEC Alpha 25.6 14.1 +dgux mc88110 2.4 0.5 +HP-UX 9000/735 2.8 3.9 +FreeBSD/i586 20.0 8.3 +FreeBSD/i586 20.4 8.3 +FreeBSD/i586 22.7 8.3 +FreeBSD/i586 22.7 8.3 +FreeBSD/i586 19.6 8.3 +IRIX IP19 12.0 11.8 +IRIX5.3 IP19 11.5 11.2 +IBM Power2 13.3 12.8 +IRIX5.3 IP22 9.4 8.5 +HP-UX 9000/735 28.6 11.5 +IRIX64 IP21 11.9 11.5 +IBM PowerPC 12.7 12.7 +HP-UX 9000/770 20.0 11.1 +HP-UX 9000/735 15.4 11.1 +HP-UX 9000/819 3.7 11.8 +HP-UX 9000/897 58.8 17.2 +HP-UX 9000/755 26.3 11.2 +IRIX64 IP19 12.5 9.8 +HP-UX 9000/735 26.3 12.0 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_http.8 
b/performance/lmbench3/doc/lat_http.8 new file mode 100644 index 0000000..a4bb459 --- /dev/null +++ b/performance/lmbench3/doc/lat_http.8 @@ -0,0 +1,41 @@ +.\" $Id$ +.TH LAT_HTTP 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_http \- measure HTTP transaction latency +.SH SYNOPSIS +.B lat_http +[ +.I "-d" +] +[ +.I "-e" +] +[ +.I "-S" +] +.I serverhost +[ +.I port +] +.SH DESCRIPTION +.B lat_http +is a client/server program that measures simple http transaction +latencies. It has its own HTTP server, and it is meant to simply +measure the minimum overall costs of simple HTTP ``GET'' +transactions. It does not measure the performance of third-party HTTP +servers. +.LP +The client simply makes a series of HTTP GET requests for files. The +files are a fixed set of files included with the benchmark. No +special care was made to ensure that the file sizes match any +predetermined distribution. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_connect(8), lat_tcp(8), lat_sig(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_ipc.tbl b/performance/lmbench3/doc/lat_ipc.tbl new file mode 100644 index 0000000..d8a7069 --- /dev/null +++ b/performance/lmbench3/doc/lat_ipc.tbl @@ -0,0 +1,16 @@ +.KS +.TS +expand doublebox; +l l c c c +l l r r r. 
+System Network \fBTCP bw\fP TCP latency UDP latency += +IRIX IP21 hippi 62 1068 1099 +SunOS-5.5 sun4u@167 100baseT 9.5 280 308 +HP-UX 9000/735 fddi 8.8 425 441 +IRIX IP22 10baseT .9 543 602 +IRIX IP21 10baseT .9 1463 1376 +HP-UX 9000/735 10baseT .9 592 603 +Linux 10baseT .7 2954 1912 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_mem_rd.8 b/performance/lmbench3/doc/lat_mem_rd.8 new file mode 100644 index 0000000..5f8509f --- /dev/null +++ b/performance/lmbench3/doc/lat_mem_rd.8 @@ -0,0 +1,97 @@ +.\" $Id: lat_mem_rd.8 1.4 00/10/16 17:13:43+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_MEM_RD 8 "$Date: 00/10/16 17:13:43+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_mem_rd \- memory read latency benchmark +.SH SYNOPSIS +.B lat_mem_rd +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "size_in_megabytes" +.I "stride" +[ +.I "stride stride..." +] +.SH DESCRIPTION +.B lat_mem_rd +measures memory read latency for varying memory sizes and strides. The +results are reported in nanoseconds per load and have been verified +accurate to within a few nanoseconds on an SGI Indy. +.LP +The +entire memory hierarchy is measured, including onboard cache latency +and size, external cache latency and size, main memory latency, and TLB +miss latency. +.LP +Only data accesses are measured; the instruction cache is not measured. +.LP +The benchmark runs as two nested loops. The outer loop is the stride size. +The inner loop is the array size. For each array size, the benchmark +creates a ring of pointers that point backward one stride. Traversing the +array is done by +.sp +.ft CB + p = (char **)*p; +.ft +.sp +in a for loop (the over head of the for loop is not significant; the loop is +an unrolled loop 100 loads long). +.LP +The size of the array varies from 512 bytes to (typically) eight megabytes. +For the small sizes, the cache will have an effect, and the loads will be +much faster. 
This becomes much more apparent when the data is plotted. +.LP +Since this benchmark uses fixed-stride offsets in the pointer chain, +it may be vulnerable to smart, stride-sensitive cache prefetching +policies. Older machines were typically able to prefetch for +sequential access patterns, and some were able to prefetch for strided +forward access patterns, but only a few could prefetch for backward +strided patterns. These capabilities are becoming more widespread +in newer processors. +.SH OUTPUT +Output format is intended as input to \fBxgraph\fP or some similar program +(we use a perl script that produces pic input). +There is a set of data produced for each stride. The data set title +is the stride size and the data points are the array size in megabytes +(floating point value) and the load latency over all points in that array. +.SH "INTERPRETING THE OUTPUT" +The output is best examined in a graph where you typically get a graph +that has four plateaus. The graph should plotted in log base 2 of the +array size on the X axis and the latency on the Y axis. Each stride +is then plotted as a curve. The plateaus that appear correspond to +the onboard cache (if present), external cache (if present), main +memory latency, and TLB miss latency. +.LP +As a rough guide, you may be able to extract the latencies of the +various parts as follows, but you should really look at the graphs, +since these rules of thumb do not always work (some systems do not +have onboard cache, for example). +.IP "onboard cache" 16 +Try stride of 128 and array size of .00098. +.IP "external cache" +Try stride of 128 and array size of .125. +.IP "main memory" +Try stride of 128 and array size of 8. +.IP "TLB miss" +Try the largest stride and the largest array. +.SH BUGS +This program is dependent on the correct operation of +.BR mhz (8). +If you are getting numbers that seem off, check that +.BR mhz (8) +is giving you a clock rate that you believe. 
+.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), tlb(8), cache(8), line(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_mmap.8 b/performance/lmbench3/doc/lat_mmap.8 new file mode 100644 index 0000000..b4a9f2f --- /dev/null +++ b/performance/lmbench3/doc/lat_mmap.8 @@ -0,0 +1,45 @@ +.\" $Id: lat_mmap.8 1.2 00/10/16 17:13:44+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_MMAP 8 "$Date: 00/10/16 17:13:44+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_mmap \- costs of mmapping and unmapping varying file sizes +.SH SYNOPSIS +.B lat_mmap +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I size +.I file +.SH DESCRIPTION +.B lat_mmap +times how fast a mapping can be made and unmade. This is useful because it +is a fundamental part of processes that use SunOS style shared libraries +(the libraries are mapped in at process start up time and unmapped at +process exit). +.LP +The benchmark maps in and unmaps the first \fIsize\fP bytes of the file +repeatedly and reports the average time for one mapping/unmapping. +.LP +The size +specification may end with ``k'' or ``m'' to mean +kilobytes (* 1024) or megabytes (* 1024 * 1024). +.SH OUTPUT +Output format is \f(CB"%0.2f %d\\n", megabytes, usecs\fP, i.e., +.sp +.ft CB +8.00 1200 +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lat_nullsys.tbl b/performance/lmbench3/doc/lat_nullsys.tbl new file mode 100644 index 0000000..1adf112 --- /dev/null +++ b/performance/lmbench3/doc/lat_nullsys.tbl @@ -0,0 +1,58 @@ +.KS +.TS +center expand doublebox; +l r. +SunOS-5.4 sun4m 7 +Sun SC1000 9 +SunOS-5.4 sun4d 12 +SunOS-5.4 sun4m 9 +SunOS-5.4 sun4m 13 +Linux alpha 2 +Linux i586 2 +Linux i586 2 +Unixware/i686 5 +Sun Ultra1 5 +DEC Alpha 9 +Linux i586 3 +Linux i586 3 +Linux alpha 3 +DEC Alpha 11 +DEC Alpha 12 +DEC Alpha 15 +IBM PowerPC 12 +DEC Alpha 17 +FreeBSD/i586 7 +FreeBSD/i586 9 +FreeBSD/i586 10 +DEC Alpha 17 +FreeBSD/i586 7 +SunOS-5.4 sun4d 26 +Linux i686 4 +Linux i586 5 +Linux i586 5 +Linux i486 6 +Linux i486 6 +DEC Alpha 9 +DEC Alpha 13 +HP-UX 9000/735 12 +HP-UX 9000/735 13 +HP-UX 9000/735 14 +IRIX5.3 IP19 20 +HP-UX 9000/755 14 +HP-UX 9000/819 19 +IRIX64 IP25 23 +IRIX IP22 10 +IRIX IP19 16 +IRIX64 IP19 18 +IRIX64 IP19 24 +FreeBSD/i586 9 +HP-UX 9000/770 11 +HP-UX 9000/897 92 +HP-UX 9000/735 12 +dgux mc88110 75 +IBM Power2 16 +IRIX64-601 IP26 20 +IRIX64 IP21 25 +IRIX5.3 IP22 11 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_ops.8 b/performance/lmbench3/doc/lat_ops.8 new file mode 100644 index 0000000..87c6e8e --- /dev/null +++ b/performance/lmbench3/doc/lat_ops.8 @@ -0,0 +1,37 @@ +.\" $Id$ +.TH LAT_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_ops \- basic CPU operation parallelism +.SH SYNOPSIS +.B lat_ops +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_ops +measures the latency of basic CPU operations, such as +integer ADD. +.TP +integer bit, add, mul, div, mod operations +maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations. +.TP +uint64 bit, add, mul, div, mod operations +maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations. +.TP +float add, mul, div operations +maximum parallelism for flot ADD, MUL, DIV operations. 
+.TP +double add, mul, div operations +maximum parallelism for float ADD, MUL, DIV operations. +.SH BUGS +This benchmark is highly experimental and may sometimes (frequently?) +give erroneous results. +.SH "SEE ALSO" +lmbench(8), par_ops(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_pagefault.8 b/performance/lmbench3/doc/lat_pagefault.8 new file mode 100644 index 0000000..e1cd958 --- /dev/null +++ b/performance/lmbench3/doc/lat_pagefault.8 @@ -0,0 +1,46 @@ +.\" $Id: lat_pagefault.8 1.2 00/10/16 17:13:45+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PAGEFAULT 8 "$Date: 00/10/16 17:13:45+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_pagefault \- measure the cost of pagefaulting pages from a file +.SH SYNOPSIS +.B lat_pagefault +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I file +[ +.I file.... +] +.SH DESCRIPTION +.B lat_pagefault +times how fast a page of a file can be faulted in. The file is flushed from +(local) memory by using the \f(CBmsync()\fP interface with the invalidate +flag set. (Note that NFS does not send this over the wire so this makes +for a handy way to measure the cost of going across the wire.) +.LP +The benchmark maps in the entire file and then accesses the pages backwards using +a stride of 256 kilobytes. +.SH OUTPUT +Output format is below; it prints the average cost of page faulting a page. +.sp +.ft CB +Pagefaults on <file>: <d> usecs +.ft +.SH BUGS +Using a stride of 256K may be a bad idea because SCSI controllers +may have caches bigger than that. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lat_pipe.8 b/performance/lmbench3/doc/lat_pipe.8 new file mode 100644 index 0000000..1dff34e --- /dev/null +++ b/performance/lmbench3/doc/lat_pipe.8 @@ -0,0 +1,38 @@ +.\" $Id: lat_pipe.8 1.2 00/10/16 17:13:45+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PIPE 8 "$Date: 00/10/16 17:13:45+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_pipe \- measure interprocess communication latency through pipes +.SH SYNOPSIS +.B lat_pipe +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_pipe +uses two processes communicating through a Unix pipe to measure interprocess +communication latencies. The benchmark passes a token back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is included. +Output format is like so +.sp +.ft CB +Pipe latency: 491 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_pipe.tbl b/performance/lmbench3/doc/lat_pipe.tbl new file mode 100644 index 0000000..5c34872 --- /dev/null +++ b/performance/lmbench3/doc/lat_pipe.tbl @@ -0,0 +1,58 @@ +.KS +.TS +center expand doublebox; +l r. 
+SunOS-5.4 sun4m 194 +SunOS-5.4 sun4m 150 +DEC Alpha 141 +Linux alpha 34 +Linux i486 56 +Linux i486 56 +Unixware/i686 86 +Linux i586 33 +Sun Ultra1 62 +SunOS-5.4 sun4m 372 +Linux alpha 34 +DEC Alpha 162 +DEC Alpha 191 +Linux i586 42 +DEC Alpha 71 +DEC Alpha 179 +Sun SC1000 278 +IBM PowerPC 65 +dgux mc88110 474 +SunOS-5.4 sun4d 519 +FreeBSD/i586 104 +FreeBSD/i586 111 +FreeBSD/i586 115 +SunOS-5.4 sun4d 671 +Linux i586 84 +Linux i686 31 +Linux i586 43 +Linux i586 43 +Linux i586 140 +DEC Alpha 185 +DEC Alpha 198 +DEC Alpha 278 +HP-UX 9000/755 193 +HP-UX 9000/897 118 +IRIX64 IP19 187 +HP-UX 9000/770 148 +HP-UX 9000/819 113 +HP-UX 9000/735 181 +FreeBSD/i586 115 +IRIX IP22 118 +HP-UX 9000/735 178 +HP-UX 9000/735 169 +HP-UX 9000/735 172 +IRIX64 IP21 264 +IRIX5.3 IP19 366 +IBM Power2 91 +IRIX64 IP25 230 +IRIX64-601 IP26 222 +IRIX64 IP19 251 +IRIX IP19 333 +FreeBSD/i586 127 +IRIX5.3 IP22 131 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_proc.8 b/performance/lmbench3/doc/lat_proc.8 new file mode 100644 index 0000000..51c8e69 --- /dev/null +++ b/performance/lmbench3/doc/lat_proc.8 @@ -0,0 +1,58 @@ +.\" $Id: lat_proc.8 1.2 00/10/16 17:13:46+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_PROC 8 "$Date: 00/10/16 17:13:46+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_proc \- process creation tests +.SH SYNOPSIS +.B lat_proc +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "procedure|fork|exec|shell" +.SH DESCRIPTION +.B lat_proc +creates processes in three different forms, each more expensive than the last. +The purposes is to measure the time that it takes to create a basic thread +of control. +.LP +The forms are listed and described below: +.TP 20 +Process fork+exit +The time it takes to split a process into two (nearly) identical copies +and have one exit. This is how new processes are created but is not +very useful since both processes are doing the same thing. 
+.TP +Process fork+execve +The time it takes to create a new process and have that new process run a new +program. This is the inner loop of all shells (command interpreters). +.TP +Process fork+/bin/sh -c +The time it takes to create a new process and have that new process run a new +program by asking the system shell to find that program and run it. This is +how the C library interface called \f(CBsystem\fP is implemented. It is the +most general and the most expensive. +.SH OUTPUT +Output is in microseconds per operation like so: +.sp +.ft CB +.nf +Process fork+exit: 6054 microseconds +Process fork+execve: 11212 microseconds +Process fork+/bin/sh -c: 44346 microseconds +.br +.fi +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_rpc.8 b/performance/lmbench3/doc/lat_rpc.8 new file mode 100644 index 0000000..12680da --- /dev/null +++ b/performance/lmbench3/doc/lat_rpc.8 @@ -0,0 +1,68 @@ +.\" $Id: lat_rpc.8 1.2 00/10/16 17:13:47+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_RPC 8 "$Date: 00/10/16 17:13:47+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_rpc \- measure interprocess communication latency via Sun RPC +.SH SYNOPSIS +.B lat_rpc +.I -s +.sp .5 +.B lat_rpc +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "-p tcp|udp" +] +.I hostname +[ +.I "udp|tcp" +] +.sp .5 +.B lat_rpc +.I "-S hostname" +.SH DESCRIPTION +.B lat_rpc +is a client/server program that measures interprocess +communication latencies. The benchmark passes a token back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. 
+.LP +This benchmark may be compared to the TCP and UDP forms of the same benchmark +to accurately see the cost of using RPC versus the cost of using plain +old TCP or UDP sockets. It is worth noting that the RPC form is passing +back and forth a single byte, not some long complicated record. +.LP +.B lat_rpc +has three forms of usage: as a server (-s), as a client (lat_rpc localhost), and +as a shutdown (lat_rpc -S localhost). +.LP +The client form may specify the protocol over which the RPCs are performed. +The default is to measure performance for both +.I udp +and +.IR tcp . +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. +Output format is like so +.sp +.ft CB +RPC/udp latency using localhost: 1344 microseconds +.br +RPC/tcp latency using localhost: 2089 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_select.8 b/performance/lmbench3/doc/lat_select.8 new file mode 100644 index 0000000..03f83bf --- /dev/null +++ b/performance/lmbench3/doc/lat_select.8 @@ -0,0 +1,33 @@ +.\" $Id$ +.TH LAT_SELECT 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_select \- select benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +[ +.I "n" +] +.SH DESCRIPTION +.B lat_select +measures the time to do a select on +.I n +file descriptors. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
+ diff --git a/performance/lmbench3/doc/lat_sig.8 b/performance/lmbench3/doc/lat_sig.8 new file mode 100644 index 0000000..91baf78 --- /dev/null +++ b/performance/lmbench3/doc/lat_sig.8 @@ -0,0 +1,33 @@ +.\" $Id$ +.TH LAT_SIG 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_sig \- select benchmark +.SH SYNOPSIS +.B lat_ctx +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "install|catch|prot" +[ +.I "file" +] +.SH DESCRIPTION +.B lat_sig +measures the time to install and catch signals. It can also measure +the time to catch a protection fault. +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. + diff --git a/performance/lmbench3/doc/lat_signal.tbl b/performance/lmbench3/doc/lat_signal.tbl new file mode 100644 index 0000000..deac19b --- /dev/null +++ b/performance/lmbench3/doc/lat_signal.tbl @@ -0,0 +1,48 @@ +.KS +.TS +expand doublebox; +l c c +l r r. 
+System sigaction \fBsig handler\fP += +DEC Alpha 20 30 +IRIX5.3 IP22 5 9 +IRIX IP22 10 12 +IRIX64-601 IP26 11 10 +Linux i586 11 22 +Linux i586 12 22 +DEC Alpha 5 101 +Linux alpha 13 38 +Linux i486 6 45 +Linux alpha 18 37 +Linux i586 9 25 +Linux i586 8 50 +dgux mc88110 5 16 +FreeBSD/i586 4 16 +FreeBSD/i586 10 34 +Linux i486 7 52 +FreeBSD/i586 9 34 +DEC Alpha 6 138 +IRIX64 IP19 6 9 +IRIX5.3 IP19 4 8 +IRIX64 IP21 5 13 +Linux i686 4 14 +Linux i586 4 23 +Linux i586 6 23 +HP-UX 9000/897 10 38 +IRIX64 IP19 4 35 +HP-UX 9000/770 10 37 +HP-UX 9000/819 11 54 +HP-UX 9000/755 10 52 +HP-UX 9000/735 10 38 +HP-UX 9000/735 6 32 +IRIX IP19 6 79 +HP-UX 9000/735 5 55 +IRIX64 IP25 5 55 +IBM PowerPC 5 19 +FreeBSD/i586 13 56 +IBM Power2 52 355 +HP-UX 9000/735 15 47 +FreeBSD/i586 18 52 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_syscall.8 b/performance/lmbench3/doc/lat_syscall.8 new file mode 100644 index 0000000..61b0ada --- /dev/null +++ b/performance/lmbench3/doc/lat_syscall.8 @@ -0,0 +1,70 @@ +.\" $Id: lat_syscall.8 1.2 00/10/16 17:13:48+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_SYSCALL 8 "$Date: 00/10/16 17:13:48+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_syscall - time simple entry into the operating system +.SH SYNOPSIS +.B lat_syscall +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I "null|read|write|stat|fstat|open" +[ +.I file +] +.SH DESCRIPTION +.TP +null +measures how long it takes to do +.IR getppid (). +We chose +.IR getppid () +because in all UNIX variants we are aware of, it requires a round-trip +to/from kernel space and the actual work required inside the kernel is +small and bounded. +.TP +read +measures how long it takes to read one byte from \f(CB/dev/zero\fP. +Note that some operating systems do not support \f(CB/dev/zero\fP. +.TP +write +times how long it takes to write one byte to \f(CB/dev/null\fP. This +is useful as a lower bound cost on anything that has to interact with +the operating system. 
+.TP +stat +measures how long it takes to +.IR stat () +a file whose inode is already cached. +.TP +fstat +measures how long it takes to +.IR fstat () +an open file whose inode is already cached. +.TP +open +measures how long it takes to +.IR open () +and then +.IR close() +a file. +.SH OUTPUT +Output format is +.sp +.ft CB +Null syscall: 67 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_tcp.8 b/performance/lmbench3/doc/lat_tcp.8 new file mode 100644 index 0000000..c945460 --- /dev/null +++ b/performance/lmbench3/doc/lat_tcp.8 @@ -0,0 +1,52 @@ +.\" $Id: lat_tcp.8 1.2 00/10/16 17:13:49+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_TCP 8 "$Date: 00/10/16 17:13:49+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_tcp \- measure interprocess communication latency via TCP/IP +.SH SYNOPSIS +.B lat_tcp +.I -s +.sp .5 +.B lat_tcp +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B lat_tcp +.I "-S hostname" +.SH DESCRIPTION +.B lat_tcp +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.LP +.B lat_tcp +has three forms of usage: as a server (-s), as a client (lat_tcp localhost), and +as a shutdown (lat_tcp -S localhost). +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. 
+Output format is like so +.sp +.ft CB +TCP latency using localhost: 700 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_tcp.tbl b/performance/lmbench3/doc/lat_tcp.tbl new file mode 100644 index 0000000..3ec3abb --- /dev/null +++ b/performance/lmbench3/doc/lat_tcp.tbl @@ -0,0 +1,59 @@ +.KS +.TS +expand doublebox; +l c c +l r r. +System TCP \fBRPC/TCP\fP += +DEC Alpha 485 788 +DEC Alpha 581 822 +Linux alpha 419 617 +DEC Alpha 629 994 +DEC Alpha 428 851 +DEC Alpha 267 371 +DEC Alpha 526 872 +DEC Alpha 412 673 +Linux i686 263 427 +Sun SC1000 855 1386 +DEC Alpha 826 1451 +Sun Ultra1 162 346 +Linux alpha 429 602 +Linux i586 1149 1434 +SunOS-5.4 sun4m 560 1196 +SunOS-5.4 sun4d 1006 1584 +SunOS-5.4 sun4m 826 1631 +SunOS-5.4 sun4m 335 784 +SunOS-5.4 sun4d 1211 1847 +Linux i586 467 713 +Linux i486 1592 2147 +FreeBSD/i586 264 450 +FreeBSD/i586 297 510 +IRIX5.3 IP22 278 641 +IRIX64-601 IP26 467 1018 +IRIX IP22 279 580 +Linux i586 477 718 +Linux i586 1196 1506 +Linux i586 1291 1668 +Linux i486 1465 2078 +IBM PowerPC 299 698 +FreeBSD/i586 312 548 +HP-UX 9000/735 222 707 +FreeBSD/i586 290 532 +HP-UX 9000/770 186 712 +FreeBSD/i586 295 535 +HP-UX 9000/819 393 668 +HP-UX 9000/735 257 805 +HP-UX 9000/755 262 812 +HP-UX 9000/735 245 800 +HP-UX 9000/897 286 854 +dgux mc88110 1381 1851 +IBM Power2 332 649 +IRIX64 IP25 482 806 +IRIX IP19 766 913 +IRIX64 IP21 643 974 +IRIX64 IP19 886 957 +HP-UX 9000/735 248 820 +IRIX64 IP19 546 900 +IRIX5.3 IP19 815 1006 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_udp.8 b/performance/lmbench3/doc/lat_udp.8 new file mode 100644 index 0000000..1545e3f --- /dev/null +++ b/performance/lmbench3/doc/lat_udp.8 @@ -0,0 +1,52 @@ 
+.\" $Id: lat_udp.8 1.2 00/10/16 17:13:50+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LAT_UDP 8 "$Date: 00/10/16 17:13:50+02:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_udp \- measure interprocess communication latency via UDP/IP +.SH SYNOPSIS +.B lat_udp +.I -s +.sp .5 +.B lat_udp +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.I hostname +.sp .5 +.B lat_udp +.I "-S hostname" +.SH DESCRIPTION +.B lat_udp +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. +.LP +.B lat_udp +has three forms of usage: as a server (-s), as a client (lat_udp localhost), and +as a shutdown (lat_udp -S localhost). +.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is included. +Output format is like so +.sp +.ft CB +UDP latency using localhost: 650 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_unix(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_udp.tbl b/performance/lmbench3/doc/lat_udp.tbl new file mode 100644 index 0000000..bf0ff9c --- /dev/null +++ b/performance/lmbench3/doc/lat_udp.tbl @@ -0,0 +1,56 @@ +.KS +.TS +expand doublebox; +l c c +l r r. 
+System UDP \fBRPC/UDP\fP += +DEC Alpha 404 718 +Linux alpha 180 317 +Linux alpha 199 330 +DEC Alpha 259 358 +Linux i686 112 217 +Linux i486 368 770 +Linux i586 187 366 +Linux i586 276 538 +DEC Alpha 379 717 +DEC Alpha 676 765 +DEC Alpha 489 834 +Sun Ultra1 197 267 +Linux i586 281 552 +Linux i586 272 553 +SunOS-5.4 sun4m 414 622 +SunOS-5.4 sun4m 914 1290 +DEC Alpha 569 836 +Sun SC1000 739 1101 +SunOS-5.4 sun4m 590 935 +FreeBSD/i586 213 387 +FreeBSD/i586 249 408 +HP-UX 9000/819 413 655 +IRIX5.3 IP22 313 671 +IRIX64-601 IP26 474 1008 +IRIX IP22 261 562 +Linux i486 351 831 +DEC Alpha 709 1109 +SunOS-5.4 sun4d 1084 1430 +SunOS-5.4 sun4d 1180 1562 +IRIX IP19 796 903 +FreeBSD/i586 240 420 +IBM Power2 254 531 +IBM PowerPC 206 536 +FreeBSD/i586 265 459 +IRIX64 IP21 660 783 +dgux mc88110 1373 2175 +HP-UX 9000/897 289 673 +HP-UX 9000/770 185 657 +HP-UX 9000/735 244 742 +IRIX5.3 IP19 785 960 +IRIX64 IP25 486 740 +HP-UX 9000/735 248 759 +HP-UX 9000/735 246 768 +HP-UX 9000/735 252 786 +IRIX64 IP19 814 964 +HP-UX 9000/755 244 832 +IRIX64 IP19 678 893 +.TE +.KE diff --git a/performance/lmbench3/doc/lat_unix.8 b/performance/lmbench3/doc/lat_unix.8 new file mode 100644 index 0000000..2117b3f --- /dev/null +++ b/performance/lmbench3/doc/lat_unix.8 @@ -0,0 +1,41 @@ +.\" $Id$ +.TH LAT_UNIX 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +lat_unix \- measure interprocess communication latency via UNIX sockets +.SH SYNOPSIS +.B lat_unix +[ +.I "-m <message size>" +] +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B lat_unix +is a client/server program that measures interprocess +communication latencies. The benchmark passes a message back and forth between +the two processes (this sort of benchmark is frequently referred to as a +``hot potato'' benchmark). No other work is done in the processes. 
+.SH OUTPUT +The reported time is in microseconds per round trip and includes the total +time, i.e., the context switching overhead is includeded. +Output format is like so +.sp +.ft CB +AF_UNIX sock stream latency: 700 microseconds +.ft +.SH ACKNOWLEDGEMENT +Funding for the development of +this tool was provided by Sun Microsystems Computer Corporation. +.SH "SEE ALSO" +lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lat_unix_connect.8 b/performance/lmbench3/doc/lat_unix_connect.8 new file mode 100644 index 0000000..b42e9a4 --- /dev/null +++ b/performance/lmbench3/doc/lat_unix_connect.8 @@ -0,0 +1,43 @@ +.\" $Id$ +.TH LAT_UNIX_CONNECT 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lat_unix_connect \- measure interprocess connection latency via UNIX sockets +.SH SYNOPSIS +.B lat_unix_connect +.I -s +.sp .5 +.B lat_unix_connect +[ +.I "-P <parallelism>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.sp .5 +.B lat_unix_connect +.I "-S" +.SH DESCRIPTION +.B lat_unix_connect +is a client/server program that measures interprocess +connection latencies. The benchmark times the creation and connection of +an AF_UNIX socket to a local server. +.LP +.B lat_connect +has three forms of usage: as a server (-s), as a client (lat_connect), +and as a shutdown (lat_connect -S). +.SH OUTPUT +The reported time is in microseconds per connection. +Output format is like so +.sp +.ft CB +UNIX connection cost: 1006 microseconds +.ft +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/line.8 b/performance/lmbench3/doc/line.8 new file mode 100644 index 0000000..0e0e043 --- /dev/null +++ b/performance/lmbench3/doc/line.8 @@ -0,0 +1,50 @@ +.\" $Id$ +.TH LINE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +line \- cache line size +.SH SYNOPSIS +.B tlb +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B line +tries to determine the cache line size in bytes of the largest cache +which is smaller than +.I len +bytes. +.LP +.B line +creates pointer chains which access the first word on each cache line +on a page (randomly meandering through all the lines in a page before +jumping to the next page). It measures the average memory latency +for a variety of line sizes, starting with a line size of one word. +When it finds an increase in the average latency that is significantly +larger than the latency for the smaller line size then it assumes that +it has found the line size. +.LP +This algorithm works because for line sizes less than the true line +size, at least two +.B line +cache lines fit in the space of a true cache line. Since that cache +line will be accessed twice, the first access will cause an expensive +cache miss, while the second access will be a cache hit. Once the +.B line +cache line is equal to the true cache line size, then all accesses +will cause cache misses. +.SH BUGS +.B line +is an experimental benchmark, but it seems to work well on most +systems. +.SH "SEE ALSO" +lmbench(8), tlb(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lmbench.3 b/performance/lmbench3/doc/lmbench.3 new file mode 100644 index 0000000..e6db877 --- /dev/null +++ b/performance/lmbench3/doc/lmbench.3 @@ -0,0 +1,344 @@ +.\" +.\" @(#)lmbench.man 3.0 2000/10/12 +.\" +.\" lmbench - benchmarking toolbox +.\" +.\" Copyright (C) 1998-2000 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "LMBENCH" 3 "$Date:$" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH "NAME" +lmbench \- benchmarking toolbox +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "typedef u_long iter_t" +.LP +.B "typedef (*benchmp_f)(iter_t iterations, void* cookie)" +.LP +.B "void benchmp(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie)" +.LP +.B "uint64 get_n()" +.LP +.B "void milli(char *s, uint64 n)" +.LP +.B "void micro(char *s, uint64 n)" +.LP +.B "void nano(char *s, uint64 n)" +.lP +.B "void mb(uint64 bytes)" +.LP +.B "void kb(uint64 bytes)" +.SH "DESCRIPTION" +Creating benchmarks using the +.I lmbench +timing harness is easy. +Since it is so easy to measure performance using +.I lmbench , +it is possible to quickly answer questions that arise during system +design, development, or tuning. For example, image processing +.LP +There are two attributes that are critical for performance, latency +and bandwidth, and +.I lmbench\'s +timing harness makes it easy to measure and report results for both. +Latency is usually important for frequently executed operations, and +bandwidth is usually important when moving large chunks of data. +.LP +There are a number of factors to consider when building benchmarks. +.LP +The timing harness requires that the benchmarked operation +be idempotent so that it can be repeated indefinitely. +.LP +The timing subsystem, +.BR benchmp , +is passed up to three function pointers. Some benchmarks may +need as few as one function pointer (for +.IR benchmark ). 
+.TP +.B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" +measures the performance of +.I benchmark +repeatedly and reports the median result. +.I benchmp +creates +.I parallel +sub-processes which run +.I benchmark +in parallel. This allows lmbench to measure the system's ability to +scale as the number of client processes increases. Each sub-process +executes +.I initialize +before starting the benchmarking cycle with +.I iterations +set to 0. It will call +.I initialize , +.I benchmark , +and +.I cleanup +with +.I iterations +set to the number of iterations in the timing loop +several times in order to collect +.I repetitions +results. The calls to +.I benchmark +are surrounded by +.I start +and +.I stop +call to time the amount of time it takes to do +the benchmarked operation +.I iterations +times. +After all the benchmark results have been collected, +.I cleanup +is called with +.I iterations set to 0 to cleanup any resources which +may have been allocated by +.I initialize +or +.IR benchmark . +.I cookie +is a void pointer to a hunk of memory that can be used to store any +parameters or state that is needed by the benchmark. +.TP +.B "void benchmp_getstate()" +returns a void pointer to the lmbench-internal state used during +benchmarking. The state is not to be used or accessed directly +by clients, but rather would be passed into +.I benchmp_interval. +.TP +.B "iter_t benchmp_interval(void* state)" +returns the number of times the benchmark should execute its +benchmark loop during this timing interval. This is used only +for weird benchmarks which cannot implement the benchmark +body in a function which can return, such as the page fault +handler. Please see +.I lat_sig.c +for sample usage. +.TP +.B "uint64 get_n()" +returns the number of times +.I loop_body +was executed during the timing interval. +.TP +.B "void milli(char *s, uint64 n)" +print out the time per operation in milli-seconds. 
+.I n +is the number of operations during the timing interval, which is passed +as a parameter because each +.I loop_body +can contain several operations. +.TP +.B "void micro(char *s, uint64 n)" +print the time per opertaion in micro-seconds. +.TP +.B "void nano(char *s, uint64 n)" +print the time per operation in nano-seconds. +.TP +.B "void mb(uint64 bytes)" +print the bandwidth in megabytes per second. +.TP +.B "void kb(uint64 bytes)" +print the bandwidth in kilobytes per second. +.SH "USING lmbench" +Here is an example of a simple benchmark that measures the latency +of the random number generator +.BR lrand48() : +.IP +.B "#include ``lmbench.h''" +.br + +.br +.B void +.br +.B benchmark_lrand48(iter_t iterations, void* cookie) +.B { +.br +.B " while(iterations-- > 0)" +.br +.B " lrand48();" +.br +.B } +.br + +.br +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B { +.br +.B " benchmp(NULL, benchmark_lrand48, NULL, 0, 1, 0, TRIES, NULL);" +.br +.B " micro("lrand48()", get_n());" +.br +.B " exit(0);" +.br +.B } +.br + +.LP +Here is a simple benchmark that measures and reports the bandwidth of +.BR bcopy : +.IP +.B "#include ``lmbench.h''" +.br + +.br +.B "#define MB (1024 * 1024) +.br +.B "#define SIZE (8 * MB)" +.br + +.br +.B "struct _state {" +.br +.B " int size;" +.br +.B " char* a;" +.br +.B " char* b;" +.br +.B "};" +.br + +.br +.B void +.br +.B initialize_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " if (!iterations) return;" +.br +.B " state->a = malloc(state->size);" +.br +.B " state->b = malloc(state->size);" +.br +.B " if (state->a == NULL || state->b == NULL)" +.br +.B " exit(1);" +.br +.B "}" +.br + +.br +.B void +.br +.B benchmark_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " while(iterations-- > 0)" +.br +.B " bcopy(state->a, state->b, state->size);" +.br +.B "}" +.br + +.br +.B void +.br 
+.B cleanup_bcopy(iter_t iterations, void* cookie) +.B "{" +.br +.B " struct _state* state = (struct _state*)cookie;" +.br + +.br +.B " if (!iterations) return;" +.br +.B " free(state->a);" +.br +.B " free(state->b);" +.br +.B "}" +.br + +.br +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B "{" +.br +.B " struct _state state;" +.br + +.br +.B " state.size = SIZE;" +.br +.B " benchmp(initialize_bcopy, benchmark_bcopy, cleanup_bcopy," +.br +.B " 0, 1, 0, TRIES, &state);" +.br +.B " mb(get_n() * state.size);" +.br +.B " exit(0);" +.br +.B "}" +.br + +.LP +A slightly more complex version of the +.B bcopy +benchmark might measure bandwidth as a function of memory size and +parallelism. The main procedure in this case might look something +like this: +.IP +.B int +.br +.B "main(int argc, char *argv[])" +.br +.B "{" +.br +.B " int size, par;" +.br +.B " struct _state state;" +.br + +.br +.B " for (size = 64; size <= SIZE; size <<= 1) {" +.br +.B " for (par = 1; par < 32; par <<= 1) {" +.br +.B " state.size = size;" +.br +.B " benchmp(initialize_bcopy, benchmark_bcopy," +.br +.B " cleanup_bcopy, 0, par, 0, TRIES, &state);" +.br +.B " fprintf(stderr, \%d\\t%d\\t\", size, par);" +.br +.B " mb(par * get_n() * state.size);" +.br +.B " }" +.br +.B " }" +.br +.B " exit(0);" +.br +.B "}" + +.SH "VARIABLES" +There are three environment variables that can be used to modify the +.I lmbench +timing subsystem: ENOUGH, TIMING_O, and LOOP_O. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), timing(3), reporting(3), results(3). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/lmbench.8 b/performance/lmbench3/doc/lmbench.8 new file mode 100644 index 0000000..262515d --- /dev/null +++ b/performance/lmbench3/doc/lmbench.8 @@ -0,0 +1,222 @@ +.\" $Id: lmbench.8 1.4 00/10/16 17:13:52+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $ +.TH LMBENCH 8 "$Date: 00/10/16 17:13:52+02:00 $" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH NAME +lmbench \- system benchmarks +.SH DESCRIPTION +.B lmbench +is a series of micro benchmarks intended to measure basic operating +system and hardware system metrics. The benchmarks fall into three +general classes: bandwidth, latency, and ``other''. +.LP +Most of the +.I lmbench +benchmarks use a standard timing harness described in timing(8) +and have a few standard options: +.IR parallelism , +.IR warmup , +and +.IR repetitions . +.I Parallelism +specifies the number of benchmark processes to run in parallel. +This is primarily useful when measuring the performance of SMP +or distributed computers and can be used to evaluate the system's +performance scalability. +.I Warmup +is the minimum number of microseconds the benchmark should +execute the benchmarked capability before it begins measuring +performance. Again this is primarily useful for SMP or distributed +systems and it is intended to give the process scheduler time to +"settle" and migrate processes to other processors. By measuring +performance over various +.I warmup +periods, users may evaluate the scheduler's responsiveness. +.I Repetitions +is the number of measurements that the benchmark should take. This +allows lmbench to provide greater or lesser statistical strength to +the results it reports. The default number of +.I repetitions +is 11. +.SH BANDWIDTH MEASUREMENTS +Data movement is fundamental to the performance on most computer systems. +The bandwidth measurements are intended to show how the system can move +data.
The results of the bandwidth metrics can be compared but care +must be taken to understand what it is that is being compared. The +bandwidth benchmarks can be reduced to two main components: operating +system overhead and memory speeds. The bandwidth benchmarks report +their results as megabytes moved per second but please note that the +data moved is \fBnot\fP necessarily the same as the memory bandwidth +used to move the data. Consult the individual man pages for more +information. +.LP +Each of the bandwidth benchmarks is listed below with a brief overview of the +intent of the benchmark. +.TP 14 +bw_file_rd +reading and summing of a file via the read(2) interface. +.TP +bw_mem_cp +memory copy. +.TP +bw_mem_rd +memory reading and summing. +.TP +bw_mem_wr +memory writing. +.TP +bw_mmap_rd +reading and summing of a file via the memory mapping mmap(2) interface. +.TP +bw_pipe +reading of data via a pipe. +.TP +bw_tcp +reading of data via a TCP/IP socket. +.TP +bw_unix +reading data from a UNIX socket. +.SH LATENCY MEASUREMENTS +Control messages are also fundamental to the performance on most +computer systems. The latency measurements are intended to show how fast +a system can be told to do some operation. The results of the +latency metrics can be compared to each other +for the most part. In particular, the +pipe, rpc, tcp, and udp transactions are all identical benchmarks +carried out over different system abstractions. +.LP +Latency numbers here should mostly be in microseconds per operation. +.TP 14 +lat_connect +the time it takes to establish a TCP/IP connection. +.TP +lat_ctx +context switching; the number and size of processes is varied. +.TP +lat_fcntl +fcntl file locking. +.TP +lat_fifo +``hot potato'' transaction through a UNIX FIFO. +.TP +lat_fs +creating and deleting small files. +.TP +lat_pagefault +the time it takes to fault in a page from a file.
+.TP +lat_mem_rd +memory read latency (accurate to the ~2-5 nanosecond range, +reported in nanoseconds). +.TP +lat_mmap +time to set up a memory mapping. +.TP +lat_ops +basic processor operations, such as integer XOR, ADD, SUB, MUL, DIV, +and MOD, and float ADD, MUL, DIV, and double ADD, MUL, DIV. +.TP +lat_pipe +``hot potato'' transaction through a Unix pipe. +.TP +lat_proc +process creation times (various sorts). +.TP +lat_rpc +``hot potato'' transaction through Sun RPC over UDP or TCP. +.TP +lat_select +select latency +.TP +lat_sig +signal installation and catch latencies. Also protection fault signal +latency. +.TP +lat_syscall +non trivial entry into the system. +.TP +lat_tcp +``hot potato'' transaction through TCP. +.TP +lat_udp +``hot potato'' transaction through UDP. +.TP +lat_unix +``hot potato'' transaction through UNIX sockets. +.TP +lat_unix_connect +the time it takes to establish a UNIX socket connection. +.SH OTHER MEASUREMENTS +.TP 14 +mhz +processor cycle time +.TP +tlb +TLB size and TLB miss latency +.TP +line +cache line size (in bytes) +.TP +cache +cache statistics, such as line size, cache sizes, memory parallelism. +.TP +stream +John McCalpin's stream benchmark +.TP +par_mem +memory subsystem parallelism. How many requests can the memory +subsystem service in parallel, which may depend on the location of the +data in the memory hierarchy. +.TP +par_ops +basic processor operation parallelism. 
+.SH SEE ALSO +bargraph(1), +graph(1), +lmbench(3), +results(3), +timing(3), +bw_file_rd(8), +bw_mem_cp(8), +bw_mem_wr(8), +bw_mmap_rd(8), +bw_pipe(8), +bw_tcp(8), +bw_unix(8), +lat_connect(8), +lat_ctx(8), +lat_fcntl(8), +lat_fifo(8), +lat_fs(8), +lat_http(8), +lat_mem_rd(8), +lat_mmap(8), +lat_ops(8), +lat_pagefault(8), +lat_pipe(8), +lat_proc(8), +lat_rpc(8), +lat_select(8), +lat_sig(8), +lat_syscall(8), +lat_tcp(8), +lat_udp(8), +lmdd(8), +par_ops(8), +par_mem(8), +mhz(8), +tlb(8), +line(8), +cache(8), +stream(8) +.SH ACKNOWLEDGEMENT +Funding for the development of these tools was provided by Sun +Microsystems Computer Corporation. +.LP +A large number of people have contributed to the testing and +development of lmbench. +.SH COPYING +The benchmarking code is distributed under the GPL with additional +restrictions, see the COPYING file. +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/lmbench3.ms b/performance/lmbench3/doc/lmbench3.ms new file mode 100755 index 0000000..fa41323 --- /dev/null +++ b/performance/lmbench3/doc/lmbench3.ms @@ -0,0 +1,1853 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.\" .tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.tl '''' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. 
ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl ''Page %'' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2002 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. 
+.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Measuring scalability +.ds author Carl Staelin +.ds micro \(*m +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds benchmp \f(CWbenchmp\fP +.ds bw_file_rd \f(CWbw_file_rd\fP +.ds bw_mem \f(CWbw_mem\fP +.ds bw_mmap_rd \f(CWbw_mmap_rd\fP +.ds bw_pipe \f(CWbw_pipe\fP +.ds bw_tcp \f(CWbw_tcp\fP +.ds bw_udp \f(CWbw_udp\fP +.ds bw_unix \f(CWbw_unix\fP +.ds close \f(CWclose\fP +.ds connect \f(CWconnect\fP +.ds dd \f(CWdd\fP +.ds execlp \f(CWexeclp\fP +.ds execve \f(CWexecve\fP +.ds exit \f(CWexit\fP +.ds fcntl \f(CWfcntl\fP +.ds fork \f(CWfork\fP +.ds fstat \f(CWfstat\fP +.ds gcc \f(CWgcc\fP +.ds get_n \f(CWget_n\fP +.ds getpid \f(CWgetpid\fP +.ds getppid \f(CWgetppid\fP +.ds gettime \f(CWgettime\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_connect \f(CWlat_connect\fP +.ds lat_ctx \f(CWlat_ctx\fP +.ds lat_dram_page \f(CWlat_dram_page\fP +.ds lat_fcntl \f(CWlat_fcntl\fP +.ds lat_fifo \f(CWlat_fifo\fP +.ds lat_fs \f(CWlat_fs\fP +.ds lat_http \f(CWlat_http\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_mmap \f(CWlat_mmap\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lat_pagefault \f(CWlat_pagefault\fP +.ds lat_pipe \f(CWlat_pipe\fP +.ds lat_proc \f(CWlat_proc\fP +.ds lat_rpc \f(CWlat_rpc\fP +.ds lat_select \f(CWlat_select\fP +.ds lat_sem \f(CWlat_sem\fP +.ds lat_sig \f(CWlat_sig\fP +.ds lat_syscall \f(CWlat_syscall\fP +.ds lat_tcp \f(CWlat_tcp\fP +.ds lat_udp \f(CWlat_udp\fP +.ds lat_unix \f(CWlat_unix\fP +.ds lat_unix_connect \f(CWlat_unix_connect\fP +.ds lat_usleep \f(CWlat_usleep\fP +.ds line \f(CWline\fP +.ds lmdd \f(CWlmdd\fP +.ds lmdd \f(CWlmdd\fP +.ds mb \f(CWmb\fP +.ds memmove \f(CWmemmove\fP +.ds mhz \f(CWmhz\fP +.ds micro \f(CWmicro\fP +.ds mmap \f(CWmmap\fP +.ds nano \f(CWnano\fP +.ds 
nanosleep \f(CWnanosleep\fP +.ds open \f(CWopen\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds pipe \f(CWpipe\fP +.ds popen \f(CWpopen\fP +.ds pselect \f(CWpselect\fP +.ds read \f(CWread\fP +.ds select \f(CWselect\fP +.ds semop \f(CWsemop\fP +.ds setitimer \f(CWsetitimer\fP +.ds sh \f(CW/bin/sh\fP +.ds stat \f(CWstat\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds tlb \f(CWtlb\fP +.ds uiomove \f(CWuiomove\fP +.ds usleep \f(CWusleep\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.R1 +accumulate +sort A+DT +database references-lmbench3 +label-in-text +bracket-label [ ] ", " +.R2 +.EQ +delim $$ +.EN +.TL +\s(14lmbench3: measuring scalability\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench3] extends the \*[lmbench2] +system to measure a system's performance under scalable +load to make it possible to assess parallel and +distributed computer performance with the same +power and flexibility that \*[lmbench2] brought +to uni-processor performance analysis. +There is a new timing harness, \*[benchmp], designed +to measure performance at specific levels of parallel +(simultaneous) load, and most existing benchmarks have +been converted to use the new harness. +.SP +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. +It is also designed to make it easy for users to create +additional micro-benchmarks that can measure features, +algorithms, or subsystems of particular interest to the +user. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] is a widely used suite of micro-benchmarks +that measures important aspects of computer system +performance, such as memory latency and bandwidth. 
+Crucially, the suite is written in portable ANSI-C +using POSIX interfaces and is intended to run on a +wide range of systems without modification. +.LP +The benchmarks included in the suite were chosen +because in the \*[lmbench] developer's experience, +they each represent an aspect of system performance +which has been crucial to an application's +performance. +Using this multi-dimensional performance analysis +approach, it is possible to better predict and +understand application performance because +key aspects of application performance can often be +understood as linear combinations of the elements +measured by \*[lmbench] +.[[ +Brown97 +.]]. +.LP +\*[lmbench3] extends the \*[lmbench] suite to +encompass parallel and distributed system performance +by measuring system performance under scalable load. +This means that the user can specify the number of +processes that will be executing the benchmarked +feature in parallel during the measurements. +It is possible to utilize this framework to develop +benchmarks to measure distributed application +performance, but it is primarily intended to +measure the performance of multiple processes +using the same system resource at the same time. +.LP +In general the benchmarks report either the latency +or bandwidth of an operation or data pathway. The +exceptions are generally those benchmarks that +report on a specific aspect of the hardware, such +as the processor clock rate, which is reported +in MHz and nanoseconds. +.LP +\*[lmbench] consists of three major components: +a timing harness, the individual benchmarks +built on top of the timing harness, and the +various scripts and glue that build and run the +benchmarks and process the results. +.NH 2 +\*[lmbench] history +.LP +\*[lmbench1] was written by Larry McVoy +while he was at Sun Microsystems. +It focussed on two measures of system performance: +latency and bandwidth. 
+It measured a number of basic operating system +functions, such as file system read/write bandwidth +or file creation time. +It also focussed a great deal of energy on measuring +data transfer operations, such as \*[bcopy] and +\*[pipe] latency and bandwidth as well as raw +memory latency and bandwidth. +.LP +Shortly after the \*[lmbench1] paper +.[[ +McVoy96 +.]] +was published, Aaron Brown examined the \*[lmbench] +benchmark suite and published a detailed critique +of its strengths and weaknesses +.[[ +Brown97 +.]]. +Largely in response to these remarks, development +of \*[lmbench2] began with a focus on improving the +experimental design and statistical data analysis. +The primary change was the development and adoption +across all the benchmarks of a timing harness that +incorporated loop-autosizing and clock resolution +detection. +In addition, each experiment was typically repeated +eleven times with the median result reported to the +user. +.LP +The \*[lmbench2] +.[[ +Staelin98 +.]] +timing harness was implemented through a new macro, +BENCH(), that automatically manages nearly all aspects +of accurately timing operations. +For example, it automatically detects the minimal timing +interval necessary to provide timing results within 1% +accuracy, and it automatically repeats most experiments +eleven times and reports the median result. +.LP +\*[lmbench3] focussed on extending +\*[lmbench]'s functionality along two dimensions: +measuring multi-processor scalability and measuring +basic aspects of processor micro-architecture. +.LP +An important feature of multi-processor systems is their +ability to scale their performance. +While \*[lmbench1] and \*[lmbench2] measure various +important aspects of system performance, they cannot +measure performance with more than one client process +active at a time. +Consequently, measuring performance of multi-processor +and clustered systems as a function of scalable load +was impossible using those tools. 
+.LP +\*[lmbench3] took the ideas and techniques +developed in the earlier versions and extended them +to create a new timing harness which can measure +system performance under parallel, scalable loads. +.LP +\*[lmbench3] also includes a version of John +McCalpin's STREAM benchmarks. Essentially the STREAM +kernels were placed in the new \*[lmbench] timing harness. +Since the new timing harness also measures scalability +under parallel load, the \*[lmbench3] STREAM +benchmarks include this capability automatically. +.LP +Finally, \*[lmbench3] includes a number of new +benchmarks which measure various aspects of the +processor architecture, such as basic operation +latency and parallelism, to provide developers +with a better understanding of system capabilities. +The hope is that better informed developers will +be able to better design and evaluate performance +critical software in light of their increased +understanding of basic system performance. +.NH 1 +Prior Work +.LP +Benchmarking is not a new field of endeavor. +There are a wide variety of approaches to +benchmarking, many of which differ greatly +from that taken by \*[lmbench]. +.LP +One common form of benchmark is to take an +important application or application and +worklist, and to measure the time required +to complete the entire task. +This approach is particularly useful when +evaluating the utility of systems for a +single and well-known task. +.LP +Other benchmarks, such as SPECint, use a +variation on this approach by measuring +several applications and combining the +results to predict overall performance. +SPEChpc96 +.[[ +SPEChpc96 +.]] +extends this approach to the +parallel and distributed domain by measuring +the performance of a selected parallel +applications built on top of MPI and/or PVM. 
+.\" .LP +.\" XXX Byte benchmark +.LP +Another variation takes the "kernel" of +an important application and measures its +performance, where the "kernel" is usually +a simplification of the most expensive +portion of a program. +Dhrystone +.[[ +Weicker84 +.]] +is an example of this type of +benchmark as it measures the performance +of important matrix operations and was often +used to predict system performance for +numerical operations. +.LP +Banga developed a benchmark to measure HTTP server +performance which can accurately measure +server performance under high load +.[[ +Banga97 +.]]. +Due to the idiosyncrasies of the HTTP protocol +and TCP design and implementation, there are +generally operating system limits on the rate +at which a single system can generate +independent HTTP requests. +However, Banga developed a system which can +scalably present load to HTTP servers in spite +of this limitation +.[[ +Banga98 +.]]. +.LP +John McCalpin's STREAM benchmark measures +memory bandwidth during four common vector +operations +.[[ +McCalpin95 +.]]. +It does not measure memory latency, and +strictly speaking it does not measure raw +memory bandwidth although memory bandwidth +is crucial to STREAM performance. +More recently, STREAM has been extended to +measure distributed application performance +using MPI to measure scalable memory subsystem +performance, particularly for multi-processor +machines. +.LP +Prestor +.[[ +Prestor01 +.]] +and Saavedra +.[[ +Saavedra95 +.]] +have developed benchmarks which analyze +memory subsystem performance. +.LP +Micro-benchmarking extends the "kernel" +approach, by measuring the performance +of operations or resources in isolation. +\*[lmbench] and many other benchmarks, such +as nfsstone +.[[ +Shein89 +.]], +measure the performance of key operations so +users can predict performance for certain +workloads and applications by combining the +performance of these operations in the right +mixture.
+.LP +Saavedra +.[[ +Saavedra92 +.]] +takes the micro-benchmark approach and applies +it to the problem of predicting application +performance. +They analyze applications or other benchmarks +in terms of their ``narrow spectrum benchmarks'' +to create a linear model of the application's +computing requirements. +They then measure the computer system's +performance across this set of micro-benchmarks +and use a linear model to predict the application's +performance on the computer system. +Seltzer +.[[ +Seltzer99 +.]] +applied this technique using the features +measured by \*[lmbench] as the basis for +application prediction. +.LP +Benchmarking I/O systems has proven particularly +troublesome over the years, largely due to the +strong non-linearities exhibited by disk systems. +Sequential I/O provides much higher bandwidth +than non-sequential I/O, so performance is +highly dependent on the workload characteristics +as well as the file system's ability to +capitalize on available sequentiality by +laying out data contiguously on disk. +.LP +I/O benchmarks have a tendency to age poorly. +For example, IOStone +.[[ +Park90a +.]], +IOBench +.[[ +Wolman89 +.]], +and the Andrew benchmark +.[[ +Howard88 +.]] +used fixed size datasets, whose size was +significant at the time, but which no longer +measure I/O performance as the data can now +fit in the processor cache of many modern +machines. +.LP +The Andrew benchmark attempts to separately +measure the time to create, write, re-read, +and then delete a large number of files in +a hierarchical file system. +.LP +Bonnie +.[[ +Bray90 +.]] +measures sequential, streaming I/O bandwidth +for a single process, and random I/O latency +for multiple processes. +.LP +Peter Chen developed an adaptive harness for +I/O benchmarking +.[[ +Chen93d +.]] +.[[ +Chen94a +.]], +which defines I/O load in terms of five parameters, +uniqueBytes, sizeMean, readFrac, seqFrac, and +processNum. 
The benchmark then explores the +parameter space to measure file system performance +in a scalable fashion. +.LP +Parkbench +.[[ +Parkbench +.]] +is a benchmark suite that can analyze parallel +and distributed computer performance. +It contains a variety of benchmarks that measure +both aspects of system performance, such as +communication overheads, and distributed application +kernel performance. +Parkbench contains benchmarks from both NAS +.[[ +NAS +.]] +and Genesis +.[[ +Glendinning94 +.]]. +.NH 1 +Timing Harness +.LP +The first, and most crucial element in extending +\*[lmbench2] so that it could measure scalable +performance, was to develop a new timing harness +that could accurately measure performance for +any given load. +Once this was done, then each benchmark would +be migrated to the new timing harness. +.LP +The harness is designed to accomplish a number +of goals: +.IP 1. +during any timing interval of any child it is +guaranteed that all other child processes are +also running the benchmark +.IP 2. +the timing intervals are long enough to average +out most transient OS scheduler effects +.IP 3. +the timing intervals are long enough to ensure +that error due to clock resolution is negligible +.IP 4. +timing measurements can be postponed to allow +the OS scheduler to settle and adjust to the +load +.IP 5. +the reported results should be representative +and the data analysis should be robust +.IP 6. +timing intervals should be as short as possible +while ensuring accurate results +.LP +Developing an accurate timing harness with a +valid experimental design is more difficult +than is generally supposed. +Many programs incorporate elementary timing +harnesses which may suffer from one or more +defects, such as insufficient care taken to +ensure that the benchmarked operation is run +long enough to ensure that the error introduced +by the clock resolution is insignificant.
+The basic elements of a good timing harness +are discussed in +Staelin +.[[ +Staelin98 +.]]. +.LP +The new timing harness must also collect and process +the timing results from all the child processes so +that it can report the representative performance. +It currently reports the median performance over +all timing intervals from all child processes. It +might perhaps be argued that it should report the +median of the medians. +.LP +When running benchmarks with more than one child, +the harness must first get a baseline estimate +of performance by running the benchmark in only +one process using the standard \*[lmbench] timing +interval, which is often 5,000 microseconds. +Using this information, the harness can compute +the average time per iteration for a single +process, and it uses this figure to compute the +number of iterations necessary to ensure that +each child runs for at least one second. +.NH 2 +Clock resolution +.LP +\*[lmbench] uses the \*[gettimeofday] clock, whose +interface resolves time down to 1 microsecond. +However, many system clocks' resolution is only 10 +milli-seconds, and there is no portable way to query +the system to discover the true clock resolution. +.LP +The problem is that the timing intervals must +be substantially larger than the clock resolution +in order to ensure that the timing error doesn't +impact the results. For example, the true duration +of an event measured with a 10 milli-second clock +can vary $+-$10 milli-seconds from the true time, +assuming that the reported time is always a +truncated version of the true time. If the clock +itself is not updated precisely, the true error +can be even larger. +This implies that timing intervals on these systems +should be at least 1 second. +.LP +However, the \*[gettimeofday] clock resolution in +most modern systems is 1 microsecond, so timing +intervals can be as small as a few milli-seconds +without incurring significant timing errors related +to clock resolution.
+.LP +Since there is no standard interface to query the operating +system for the clock resolution, \*[lmbench] must +experimentally determine the appropriate timing +interval duration which provides results in a timely +fashion with a negligible clock resolution error. +.NH 2 +Coordination +.LP +Developing a timing harness that correctly manages +$N$ processes and accurately measures system performance +over those same $N$ processes is significantly more difficult +than simply measuring system performance with a single +process because of the asynchronous nature of +parallel programming. +.LP +In essence, the new timing harness needs to create +$N$ jobs, and measure the average performance of the +target subsystem while all $N$ jobs are running. This +is a standard problem for parallel and distributed +programming, and involves starting the child +processes and then stepping through a handshaking +process to ensure that all children have started +executing the benchmarked operation before any child +starts taking measurements. +.TSTART +.TS +box tab (/) box expand ; +c c +l l . 
+Parent/Child +T{ +\(bu start up P child processes +T}/ +T{ +\(bu wait for P \fIready\fR signals +T}/T{ +\(bu run benchmark operation for a little while +T} +\(da/T{ +\(bu send a \fIready\fR signal +T} +T{ +\(bu on receipt of \fIready\fR signals, sleep for \fIwarmup\fR \*[micro]s +T}/T{ +\(bu run benchmark operation while polling for a \fIgo\fR signal +T} +T{ +\(bu send \fIgo\fR signal to P children +T}/\(da +T{ +\(bu wait for P \fIdone\fR signals +T}/T{ +\(bu on receipt of \fIgo\fR signal, begin timing benchmark operation +T} +\(da/T{ +\(bu send a \fIdone\fR signal +T} +T{ +\(bu on receipt of \fIdone\fR signals, iterate through children +sending \fIresults\fR signal and gathering results +T}/T{ +\(bu run benchmark operation while polling for a \fIresults\fR signal +T} +T{ +\(bu collate results +T}/T{ +\(bu on receipt of \fIresults\fR signal, send timing results +and wait for \fIexit\fR signal +T} +T{ +\(bu send \fIexit\fR signal +T}/\(da +/T{ +\(bu exit +T} +.TE +.TEND "Timing harness sequencing" +.nr TABLEseq \n[TABLE] +.LP +Table \n[TABLEseq] shows how the parent and child +processes coordinate their activities to ensure +that all children are actively running the +benchmark activity while any child could be +taking timing measurements. +.LP +The reason for the separate "exit" signal is +to ensure that all properly managed children +are alive until the parent allows them to die. +This means that any SIGCHLD events that occur +before the "exit" signal indicate a child +failure. +.NH 2 +Accuracy +.LP +The new timing harness also needs to ensure that the +timing intervals are long enough for the results to +be representative. The previous timing harness assumed +that only single process results were important, and +it was able to use timing intervals as short as +possible while ensuring that errors introduced by +the clock resolution were negligible. +In many instances this meant that the timing intervals +were smaller than a single scheduler time slice.
+The new timing harness must run benchmarked operations +long enough to ensure that timing intervals are longer +than a single scheduler time slice. +Otherwise, you can get results which are complete nonsense. +For example, running several copies of an \*[lmbench2] +benchmark on a uni-processor machine will often report +that the per-process performance with $N$ jobs running in +parallel is equivalent to the performance with a single +job running!\** +.FS +This was discovered by someone who naively attempted +to parallelize \*[lmbench2] in this fashion, and I +received a note from the dismayed developer describing +the failed experiment. +.FE +.LP +In addition, since the timing intervals now have to be +longer than a single scheduler time slice, they also +need to be long enough so that a single scheduler time +slice is insignificant compared to the timing interval. +Otherwise the timing results can be dramatically +affected by small variations in the scheduler's +behavior. +.LP +Currently \*[lmbench] does not measure the scheduler +timeslice; the design blithely assumes that timeslices +are generally on the order of 10-20ms, so one second +timing intervals are sufficient. +Some schedulers may utilize longer time slices, but +this has not (yet) been a problem. +.NH 2 +Resource consumption +.LP +One important design goal was that resource consumption +be constant with respect to the number of child +processes. +This is why the harness uses shared pipes to communicate +with the children, rather than having a separate set of +pipes to communicate with each child. +An early design of the system utilized a pair of pipes +per child for communication and synchronization between +the master and slave processes. However, as the number +of child processes grew, the fraction of system +resources consumed by the harness grew and the additional +system overhead could start to interfere with the accuracy +of the measurements. 
+.LP +Additionally, if the master has to poll (\*[select]) +$N$ pipes, then the system overhead of that operation +also scales with the number of children. +.NH 2 +Pipe atomicity +.LP +Since all communication between the master process and +the slave (child) processes is done via a set of shared +pipes, we have to ensure that we never have a situation +where the message can be garbled by the intermingling +of two separate messages from two separate children. +This is ensured by either using pipe operations that +are guaranteed to be atomic on all machines, or by +coordinating between processes so that at most one +process is writing at a time. +.LP +The atomicity guarantees are provided by having each +client communicate synchronization states in one-byte +messages. For example, the signals from the master +to each child are one-byte messages, so each child +only reads a single byte from the pipe. Similarly, +the responses from the children back to the master +are also one-byte messages. In this way no child +can receive partial messages, and no message can +be interleaved with any other message. +.LP +However, using this design means that we need to +have a separate pipe for each \fIbarrier\fR in +the process, so the master uses three pipes to +send messages to the children, namely: \fIstart_signal\fR, +\fIresult_signal\fR, and \fIexit_signal\fR. +If a single pipe was used for all three barrier events, +then it is possible for a child to miss a signal, +or if the signal is encoded into the message, +then it is possible for a child to infinite loop +pulling a signal off the pipe, recognizing that +it has already received that signal so that it +needs to push it back into the pipe, and then +re-receiving the same message it just re-sent. +.LP +However, all children share a single pipe to send +data back to the master process. Usually the +messages on this pipe are single-byte signals, +such as \fIready\fR or \fIdone\fR.
However, the +timing data results need to be sent from the +children to the master and they are (much) larger +than a single-byte message. In this case, the +timing harness sends a single-byte message on +the \fIresult_signal\fR channel, which can be +received by at most one child process. This +child then knows that it has sole ownership of +the response pipe, and it writes its entire +set of timing results to this pipe. Once the +master has received all of the timing results +from a single child, it sends the next one-byte +message on the \fIresult_signal\fR channel to +gather the next set of timing results. +.TSTART 1 +.so lmbench3_signals.pic +.FEND "Control signals" 1 +.nr FIGUREsig \n[FIGURE] +.LP +The design of the signals is shown in Figure \n[FIGUREsig]. +.NH 2 +Benchmark initialization +.LP +By allowing the benchmark to specify an +initialization routine that is run in the +child processes, the new timing harness +allows benchmarks to do either or both +global initializations that are shared +by all children and specific per-child +initializations that are done independently +by each child. +Global initialization is done in the +master process before the \*[benchmp] +harness is called, so the state is +preserved across the \*[fork] operations. +Per-child initialization is done inside +the \*[benchmp] harness by the optional +initialization routine and is done after +the \*[fork] operation. +.LP +Similarly, each benchmark is allowed to +specify a cleanup routine that is run by +the child processes just before exiting. +This allows the benchmark routines to +release any resources that they may have +used during the benchmark. +Most system resources would be automatically +released on process exit, such as file +descriptors and shared memory segments, +but some resources such as temporary files +might need to be explicitly released by +the benchmark. 
+.NH 2 +Scheduler transients +.LP +Particularly on multi-processor systems, side-effects +of process migration can dramatically affect program +runtimes. For example, if the processes are all +initially assigned to the same processor as the parent +process, and the timing is done before the scheduler +migrates the processes to other available processors, +then the system performance will appear to be that of +a uniprocessor. Similarly, if the scheduler is +over-enthusiastic about re-assigning processes to +processors, then performance will be worse than +necessary because the processes will keep encountering +cold caches and will pay exorbitant memory access +costs. +.LP +The first case is a scheduler transient, and users +may not want to measure such transient phenomena +if their primary interest is in predicting performance +for long-running programs. Conversely, that same +user would be extraordinarily interested in the +second phenomenon. The harness was designed to +allow users to specify that the benchmarked processes +are run for long enough to (hopefully) get the +scheduler past the transient startup phase, so it +can measure the steady-state behavior. +.NH 2 +Data analysis +.LP +Analyzing the data to produce representative results +is a crucial step in the benchmarking process. +\*[lmbench] generally reports the \fImedian\fP +result for $11$ measurements. +Most benchmarks report the results of a single measurement +.[[ +Howard88 +.]], +an average of several results +.[[ +McCalpin95 +.]], +or a trimmed mean +.[[ +Brown97 +.]]. +.\" XXX UNKNOWN: +.\" .RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90 +.LP +Since \*[lmbench] is able to use timing intervals +that are often smaller than a scheduler time slice +when measuring single-process performance, the raw +timing results are often severely skewed. +Often most results cluster around a single value with +a small number of outliers with significantly +larger values. 
+The median is preferable to the mean when the data +can be very skewed +.[[ +Jain91 +.]]. +Since the timing intervals are significantly longer +when the desired load is larger than a single +process, the results tend not to be as badly skewed. +In these cases we could use the \fImean\fR instead, +but we decided to use a uniform statistical framework, +so we usually use the median. +.LP +In some instances, however, \*[lmbench] internally +uses the \fIminimum\fP rather than the median, +such as in \*[mhz]. +In those instances, we are not trying to find the +\fIrepresentative\fP value, but rather the +\fIminimum\fP value. +There are only a few sources of error which could +cause the measured timing result to be shorter +than the true elapsed time: the system clock is +adjusted, or round-off error in the clock resolution. +The timing interval duration is set to ensure that +the round-off error is bounded to 1% of the timing +interval, and we blithely assume that people don't +reset their system clocks while benchmarking their +systems. +.LP +\*[lmbench] does not currently report any statistics +representing measurement variation, such as the +difference between the first and third quartiles. +This is an enhancement under active consideration. +.NH 1 +Interface +.LP +Unfortunately we had to move away from the +macro-based timing harness used in \*[lmbench2] +and migrate to a function-based system +because the macros were too large for some +C pre-processors. 
+.TSTART 1 +.DS L +\f(CWtypedef void (*bench_f)(iter_t iters, + void* cookie); +typedef void (*support_f)(void* cookie); + +extern void benchmp(support_f initialize, + bench_f benchmark, + support_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie); + +extern uint64 gettime(); +extern uint64 get_n(); +extern void nano(char* s, uint64 n); +extern void micro(char* s, uint64 n); +extern void mb(uint64 bytes);\fP +.DE +.FEND "Programming interface" 1 +.nr FIGinterface \n[FIGURE] +.LP +Figure \n[FIGinterface] shows the key elements +of the new timing harness and result reporting +interface. +A brief description of the \*[benchmp] parameters: +.IP \fIenough\fR +Enough can be used to ensure that a timing interval is at +least 'enough' microseconds in duration. For most benchmarks +this should be zero, but some benchmarks have to run for more +time due to startup effects or other transient behavior. +.IP \fIparallel\fR +is simply the number of instances of the benchmark +that will be run in parallel on the system. +.IP \fIwarmup\fR +can be used to force the benchmark to run for warmup +microseconds before the system starts making timing measurements. +Note that it is a lower bound, not a fixed value, since it +is simply the time that the parent sleeps after receiving the +last "ready" signal from each child (and before it sends +the "go" signal to the children). +.IP \fIrepetitions\fR +is the number of times the experiment should +be repeated. The default is eleven. +.IP \fIcookie\fR +is a pointer that can be used by the benchmark +writer to pass in configuration information, such as buffer +size or other parameters needed by the inner loop. +In \*[lmbench3] it is generally used to point +to a structure containing the relevant configuration +information. +.LP +\*[gettime] returns the median timing +interval duration, while \*[get_n] returns +the number of iterations executed during +that timing interval. 
+.LP +\*[nano] and \*[micro] print the passed +string latency followed by the latency +in terms of nanoseconds and microseconds +respectively. +The latency is computed as $gettime()/n$, +where $n$ is the passed parameter. +The reason $n$ is passed as a parameter +is because the benchmark can actually +execute the operation of interest multiple +times during a single iteration. +For example, the memory latency benchmarks +typically repeat the memory load operation +a hundred times inside the loop, so the +actual number of operations is +$100 times get_n()$, and it is this value +that should be passed to \*[nano] or \*[micro]. +.LP +\*[mb] reports the bandwidth in MB/s +when given the total number of bytes +processed during the timing interval. +Note that for scalable benchmarks that +process $"size"$ bytes per iteration, the +total number of bytes processed is +$get_n() times parallel times "size"$. +.TSTART 1 +.DS L +\f(CW#include "bench.h" + +void +bench(iter_t iters, void* cookie) +{ + while (iters-- > 0) { + getppid(); + } +} + +int +main(int argc, char* argv[]) +{ + benchmp(NULL, bench, NULL, + 0, 1, 0, TRIES, NULL); + nano("getppid", get_n()); + return(0); +}\fP +.DE +.FEND "A sample benchmark" 1 +.nr FIGsample \n[FIGURE] +.LP +Figure \n[FIGsample] shows a sample benchmark +that measures the latency of the \*[getppid] +system call using this timing harness. +Since there is no setup or cleanup needed +for this benchmark, the \fIinitialize\fR +and \fIcleanup\fR parameters are NULL. +The \fIbench\fR routine simply calls +\*[getppid] as many times as requested, +and the rest of the parameters, \fIenough\fR, +\fIparallel\fR, \fIwarmup\fR, +\fIrepetitions\fR, and \fIcookie\fR +are given with the default values. +.NH 1 +Benchmarks +.LP +\*[lmbench] contains a large number of micro-benchmarks +that measure various aspects of hardware and operating +system performance. 
The benchmarks generally measure +latency or bandwidth, but some new benchmarks also +measure instruction-level parallelism. +.TSTART +.TS +center box tab (&); +c c +l & l . +Name&Measures +_ +&\fBBandwidth\fR +\fIbw_file_rd\fR&T{ +\*[read] and then load into processor +T} +\fIbw_mem\fR&T{ +read, write, and copy data to/from memory +T} +\fIbw_mmap_rd\fR&read from \*[mmap]'ed memory +\fIbw_pipe\fR&\*[pipe] inter-process data copy +\fIbw_tcp\fR&TCP inter-process data copy +\fIbw_unix\fR&UNIX inter-process +_ +&\fBLatency\fR +lat_connect&TCP connection +\fIlat_ctx\fR&T{ +context switch via \*[pipe]-based ``hot-potato'' token passing +T} +lat_dram_page&T{ +DRAM page open +T} +\fIlat_fcntl\fR&T{ +\*[fcntl] file locking ``hot-potato'' token passing +T} +\fIlat_fifo\fR&T{ +FIFO ``hot-potato'' token passing +T} +lat_fs&file creation and deletion +lat_http&http GET request latency +\fIlat_mem_rd\fR&memory read +\fIlat_mmap\fR&\*[mmap] operation +\fIlat_ops\fR&T{ +basic operations (\fIxor\fR, \fIadd\fR, \fImul\fR, \fIdiv\fR, \fImod\fR) +on (relevant) basic data types (\fIint\fR, \fIint64\fR, \fIfloat\fR, +\fIdouble\fR) +T} +\fIlat_pagefault\fR&page fault handler +\fIlat_pipe\fR&\*[pipe] ``hot-potato'' token passing +\fIlat_pmake\fR&T{ +time to complete $N$ parallel jobs that each do $usecs$-worth of work +T} +\fIlat_proc\fR&T{ +procedure call overhead and process creation using \*[fork], +\*[fork] and \*[execve], and \*[fork] and \*[sh] +T} +\fIlat_rand\fR&T{ +random number generator +T} +\fIlat_rpc\fR&SUN RPC procedure call +\fIlat_select\fR&\*[select] operation +\fIlat_sem\fR&T{ +semaphore ``hot-potato'' token passing +T} +\fIlat_sig\fR&T{ +signal handle installation and handling +T} +\fIlat_syscall\fR&T{ +\*[open], \*[close], \*[getppid], \*[write], \*[stat], \*[fstat] +T} +\fIlat_tcp\fR&TCP ``hot-potato'' token passing +\fIlat_udp\fR&UDP ``hot-potato'' token passing +\fIlat_unix\fR&UNIX ``hot-potato'' token passing +\fIlat_unix_connect\fR&UNIX socket connection 
+\fIlat_usleep\fR&T{ +\*[usleep], \*[select], \*[pselect], \*[nanosleep], \*[setitimer] +timer resolution +T} +_ +&\fBOther\fR +disk&T{ +zone bandwidths and seek times +T} +line&cache line size +lmdd&\fIdd\fR clone +par_mem&memory subsystem ILP +par_ops&basic operation ILP +\fIstream\fR&STREAM clones +tlb&TLB size +.TE +.TEND "\*[lmbench] micro-benchmarks" +.nr TABLEbench \n[TABLE] +.LP +Table \n[TABLEbench] contains the full list of micro-benchmarks +in \*[lmbench3]. +Benchmarks that were converted to measure performance +under scalable load are shown in italics, while the +remaining benchmarks are shown with normal typeface. +A detailed description of most benchmarks can be found in +.[[ +McVoy96 +.]]. +.NH 1 +Scaling Benchmarks +.LP +There are a number of issues associated with converting +single-process benchmarks with a single process to +scalable benchmarks with several independent processes, +in addition to the various issues addressed by +the timing harness. +Many of the benchmarks consume or utilize system +resources, such as memory or network bandwidth, +and a careful assessment of the likely resource +contention issues is necessary to ensure that the +benchmarks measure important aspects of system performance +and not artifacts of artificial resource contention. +.LP +For example, the Linux 2.2 and 2.4 kernels use a single lock to +control access to the kernel data structures for a file. +This means that multiple processes accessing that file +will have their operations serialized by that lock. +If one is interested in how well a system can handle +multiple independent accesses to separate files and +if the child processes all access the same file, then +this file sharing is an artificial source of contention +with potentially dramatic effects on the benchmark +results. +.NH 2 +File System +.LP +A number of the benchmarks measure aspects of file system +performance, such as \*[bw_file_rd], \*[bw_mmap_rd], +\*[lat_mmap], and \*[lat_pagefault]. 
+It is not immediately apparent how these benchmarks should +be extended to the parallel domain. For example, it may +be important to know how file system performance scales +when multiple processes are reading the same file, or +when multiple processes are reading different files. +The first case might be important for large, distributed +scientific calculations, while the second might be more +important for a web server. +.LP +However, for the operating system, the two cases are +significantly different. When multiple processes +access the same file, access to the kernel data +structures for that file must be coordinated and +so contention and locking of those structures can +impact performance, while this is less true when +multiple processes access different files. +.LP +In addition, there are any number of issues associated +with ensuring that the benchmarks are either measuring +operating system overhead (e.g., that no I/O is actually +done to disk), or actually measuring the system's I/O +performance (e.g., that the data cannot be resident in +the buffer cache). Especially with file system related +benchmarks, it is very easy to develop benchmarks that +compare apples and oranges (e.g., the benchmark includes +the time to flush data to disk on one system, but only +includes the time to flush a portion of data to disk on +another system). +.LP +\*[lmbench3] allows the user to measure either case +as controlled by a command-line switch. When measuring +accesses to independent files, the benchmarks first +create their own private copies of the file, one for +each child process. Then each process accesses its +private file. When measuring accesses to a single +file, each child simply uses the designated file +directly. +.NH 2 +Context Switching +.LP +Measuring context switching accurately is a difficult +task. \*[lmbench1] and \*[lmbench2] measured context +switch times via a "hot-potato" approach using pipes +connected in a ring. 
However, this experimental +design heavily favors schedulers that do "hand-off" +scheduling, since at most one process is active at +a time. +Consequently, it is not really a good benchmark +for measuring scheduler overhead in multi-processor +machines. +.LP +The design currently used in \*[lmbench3] is to +create $N$ \*[lmbench2]-style process rings and +to measure the context switch times with all $N$ +rings running in parallel. +This does extend the \*[lmbench2] context switch +benchmark to a scalable form, but it still suffers +from the same weaknesses. +.LP +One approach that was considered was to replace +the ring with a star formation, so the master +process would send tokens to each child and +then wait for them all to be returned. +This has the advantage that more than one process +is active at a time, reducing the sensitivity +to "hand-off" scheduling. +However, this same feature can cause problems +on a multi-processor system because several +of the context switches and working set accesses +can occur in parallel. +.LP +The design and methodology for measuring context +switching and scheduler overhead need to be revisited +so that it can more accurately measure performance +for multi-processor machines. +.NH 1 +Stream +.LP +\*[lmbench3] includes a new micro-benchmark, +\*[stream] which measures the performance of +John McCalpin's STREAM benchmark kernels for +both STREAM version 1 +.[[ +McCalpin95 +.]] +and version 2 +.[[ +McCalpin2002 +.]]. +This benchmark faithfully recreates each of the +kernel operations from both STREAM benchmarks, +and because of the powerful new timing harness it +can easily measure memory system scalability. +.TSTART +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . 
+Stream +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +COPY|$a[i]=b[i]$|8(+8)|8|0 +SCALE|$a[i]=q times b[i]$|8(+8)|8|1 +ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 +TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2(-1) +.TE +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . +Stream2 +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +FILL|$a[i]=q$|0(+8)|8|0 +COPY|$a[i]=b[i]$|8(+8)|8|0 +DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2(-1) +SUM|$sum=sum + a[i]$|8|0|1 +.TE +.TEND "Stream operations" +.LP +Table \n[TABLE] is based on McCalpin's tables +.[[ +McCalpin95 +.]] +.[[ +McCalpin2002 +.]] +and shows the four kernels for each version +of the \*[stream] benchmark. +Note that the +.I read +columns include numbers in parentheses, which +represent the average number of bytes read into +the cache as a result of the write to that +variable\**. +.FS +This number is independent of the cache +line size because the STREAM uses dense +arrays, so the cost is amortized over the +subsequent operations on the rest of the +line. +.FE +Cache lines are almost invariably +bigger than a single double, and so when a +write miss occurs the cache will read the line +from memory and then modify the selected bytes. +Sometimes vector instructions such as SSE +and 3DNow can avoid this load by writing an +entire cache line at once. +.LP +In addition, some architectures support +multiply-add instructions which can do +both the multiply and add operations for +TRIAD and DAXPY in a single operation, +so the physical FLOPS count would be 1 +for these architectures on these +instructions. +The numbers in parenthesis in the +.I FLOPS +column reflect this reduction in +FLOPS count. +.LP +Following the STREAM bandwidth reporting +conventions, the \*[lmbench] STREAM benchmarks +report their results as bandwidth results +(MB/s) computed as a function of the amount +of data explicitly read or written by the +benchmark. 
+For example, \fIcopy\fR and \fIscale\fR copy +data from one array to the other, so the +bandwidth is measured as a function of the +amount of data read plus the amount of data +written, or the sum of the two array sizes. +Similarly, \fIsum\fR, \fItriad\fR, and \fIdaxpy\fR +operate on three arrays, so the amount of data +transferred is the sum of the sizes of the three +arrays. +Note that the actual amount of data that is +transferred by the system may be larger +because in the write path the cache may +need to fetch (read) the cache line before +a portion of it is overwritten by dirty data. +.NH 1 +Unscalable benchmarks +.LP +There are a number of benchmarks which either +did not make sense for scalable load, such as +\*[mhz], or which could not +be extended to measure scalable load due to +other constraints, such as \*[lat_connect]. +.LP +\*[mhz] measures the processor clock speed, +which is not a scalable feature of the system, +so it doesn't make any sense to create a +version of it that measures scalable performance. +.LP +More specifically, \*[lat_connect] measures +the latency of connecting to a TCP socket. +TCP implementations have a timeout on +sockets and there is generally a fixed size +queue for sockets in the TIMEOUT state. +This means that once the queue has been +filled by a program connecting and closing +sockets as fast as possible, then all new +socket connections have to wait TIMEOUT +seconds. Needless to say, this gives no +insight into the latency of socket creation +per se, but is rather a boring artifact. +Since the \*[lmbench2] version of the +benchmark can run for very short periods +of time, it generally does not run into +this problem and is able to correctly +measure TCP connection latency. +.LP +Any scalable version of the benchmark needs +each copy to run for at least a second, and +there are $N$ copies creating connections as +fast as possible, so it would essentially be +guaranteed to run into the TIMEOUT problem. 
+Consequently, \*[lat_connect] was not +enhanced to measure scalable performance. +.LP +\*[lat_fs] has not yet been parallelized because +of the difficulty in measuring file creation and +file deletion times in the new timing harness. +The timing harness assumes that it can ask the +benchmarked operation to be repeated as many times +as necessary. +This would mean that the file creation benchmark +could create any number of new files of a given +size, which could well fill up the file system. +The real problem lies in the file deletion benchmark. +In order to delete files of a given size, they +must have been created before the benchmark begins. +However, the number of files is not known in +advance, so the benchmark would have a difficult +time ensuring that it has created enough files. +.LP +The benchmarks that measure aspects of memory-subsystem +micro-architecture, \*[lat_dram_page], \*[line], +\*[par_mem], and \*[tlb], were not parallelized because +the multiple processes' memory access patterns would +likely interfere with one another. +For example, in \*[lat_dram_page], those accesses +which were supposed to be to open DRAM pages could +well be accessing closed DRAM pages, invalidating +the benchmark. +.LP +\*[lmdd] was not parallelized because it is +supposed to be a clone of \*[dd], and it +wasn't clear what a parallel form of \*[dd] +would look like. +.NH 1 +Results +.LP +The results presented here were obtained using +\*[lmbench] version 3.0-a2 under +Linux 2.4.18-6mdk on a two processor 450MHz PIII +running a stock Mandrake 8.2 Linux 2.4.18 kernel. +.TSTART +.TS +center box tab (&); +c | c s +l | l | l. +Benchmark&Latency ($mu$s) +_ +&1 process&2 processes +_ +null call&0.79&0.81 +null I/O&1.39&2.39 +stat&9.26&25.9 +open/close&11.7&27.1 +select (TCP)&55.3&58.6 +signal install&1.89&1.95 +signal handler&6.34&7.21 +fork process&793.&868. 
+exec process&2474&2622 +sh process&24.K&25.K +pipe&17.7&23.3 +unix socket&51.6&37.6 +UDP&70.2&70.6 +TCP&91.2&92.3 +rpc (UDP)&120.0&120.4 +rpc (TCP)&157.1&159.1 +.TE +.TEND "Latency results" +.nr TABLElatency \n[TABLE] +.TSTART +.TS +center box tab (&); +c | c s +l | l | l. +Benchmark&Bandwidth (MB/s) +_ +&1 process&2 processes +_ +pipe&155&268 +unix socket&142&179 +TCP&57.5&57.8 +bcopy(libc)&134&175 +bcopy(hand)&144&174 +memory read&319&486 +memory write&199&202 +STREAM copy&288.68&367.99 +STREAM scale&290.39&369.08 +STREAM sum&337.75&415.54 +STREAM triad&246.90&380.09 +STREAM2 fill&198.96&276.28 +STREAM2 copy&288.55&359.93 +STREAM2 daxpy&318.98&493.79 +STREAM2 sum&354.03&512.05 +.TE +.TEND "Bandwidth results" +.nr TABLEbandwidth \n[TABLE] +.TSTART +.TS +center box tab (&); +c | c s s +l | l | l | l. +Benchmark&Load +_ +&1&2&2clone +_ +bw_file_rd&151.04&266.74&273.51 +bw_mmap_rd&316.08&480.02&482.57 +lat_mmap&615&878&786 +lat_pagefault&2.9802&3.9159&3.4589 +.TE +.TEND "File bandwidth results" +.nr TABLEfile \n[TABLE] +.LP +Table \n[TABLElatency] shows the latency of +various system and communication operations +for both 1 and 2 process loads, while +Table \n[TABLEbandwidth] shows the bandwidth +of various data operations and +Table \n[TABLEfile] shows how various file +system operations scale. +Table \n[TABLEfile] shows system performance +with one process, two processes sharing the +same file, and two processes accessing their +own files. 
+.TSTART 1 +.G1 +label left "Latency (ns)" +label bottom "Memory size (MB)" +coord x 0.0004,32 y 5,300 log x +draw solid +0.00049 6.680 +0.00098 6.683 +0.00195 6.680 +0.00293 6.680 +0.00391 6.681 +0.00586 6.681 +0.00781 6.681 +0.00977 6.684 +0.01172 6.683 +0.01367 6.690 +0.01562 6.725 +0.01758 48.977 +0.01953 49.051 +0.02148 49.043 +0.02344 49.025 +0.02539 48.889 +0.02734 48.880 +0.02930 48.902 +0.03125 49.020 +0.03516 49.043 +0.03906 48.904 +0.04297 49.044 +0.04688 49.027 +0.05078 49.046 +0.05469 48.889 +0.05859 49.018 +0.06250 49.012 +0.07031 49.025 +0.07812 49.030 +0.08594 48.936 +0.09375 49.042 +0.10156 49.022 +0.10938 48.889 +0.11719 49.073 +0.12500 48.998 +0.14062 49.043 +0.15625 49.125 +0.17188 49.160 +0.18750 49.113 +0.20312 49.123 +0.21875 48.991 +0.23438 49.045 +0.25000 49.184 +0.28125 49.971 +0.31250 57.735 +0.34375 72.668 +0.37500 79.106 +0.40625 77.612 +0.43750 78.764 +0.46875 88.636 +0.50000 104.024 +1.00000 179.817 +1.50000 182.297 +2.00000 182.043 +2.50000 182.902 +3.00000 183.130 +3.50000 184.333 +4.00000 182.868 +5.00000 183.319 +6.00000 183.208 +7.00000 183.688 +8.00000 183.871 +10.00000 183.659 +12.00000 183.583 +14.00000 183.773 +16.00000 183.828 +18.00000 183.894 +20.00000 183.933 +30.00000 183.971 +new dashed +0.00049 6.811 +0.00098 6.815 +0.00195 6.825 +0.00293 6.807 +0.00391 6.803 +0.00586 6.822 +0.00781 6.826 +0.00977 6.825 +0.01172 6.922 +0.01367 6.825 +0.01562 6.866 +0.01758 49.954 +0.01953 49.989 +0.02148 50.021 +0.02344 50.019 +0.02539 50.003 +0.02734 50.085 +0.02930 50.000 +0.03125 50.187 +0.03516 49.988 +0.03906 50.032 +0.04297 49.986 +0.04688 50.186 +0.05078 50.196 +0.05469 50.107 +0.05859 50.087 +0.06250 49.983 +0.07031 50.092 +0.07812 50.135 +0.08594 50.057 +0.09375 50.188 +0.10156 65.950 +0.10938 55.614 +0.11719 54.328 +0.12500 61.700 +0.14062 59.710 +0.15625 52.637 +0.17188 82.911 +0.18750 74.304 +0.20312 72.371 +0.21875 78.124 +0.23438 74.577 +0.25000 96.374 +0.28125 110.708 +0.31250 97.832 +0.34375 103.006 +0.37500 129.292 
+0.40625 140.816 +0.43750 165.255 +0.46875 164.632 +0.50000 170.912 +1.00000 233.968 +1.50000 285.445 +2.00000 241.341 +2.50000 263.436 +3.00000 273.101 +3.50000 269.926 +4.00000 233.626 +5.00000 222.305 +6.00000 293.832 +7.00000 238.863 +8.00000 245.026 +10.00000 282.297 +12.00000 239.152 +14.00000 274.218 +16.00000 226.299 +18.00000 284.183 +20.00000 224.596 +30.00000 236.416 +"1 process" at 5,165 +"2 processes" at 0.3,280 +.G2 +.FEND "Memory subsystem performance" 1 +.nr FIGUREmem \n[FIGURE] +.LP +Figure \n[FIGUREmem] shows the memory latency +curves with 32 byte strides for one and two +process loads versus memory size. +.NH 1 +Conclusion +.LP +\*[lmbench] is a useful, portable micro-benchmark +suite designed to measure important aspects of +system performance. +\*[lmbench3] adds a number of important extensions, +such as the ability to measure system scalability. +.LP +The benchmarks are available via ftp from: +.IP +.I "http://ftp.bitmover.com/lmbench" +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into the benchmarks. +We especially thank: +Eric Anderson \s-1(HP)\s0, +Bruce Chapman \s-1(SUN)\s0, +Larry McVoy \s-1(BitMover)\s0, +David Mosberger \s-1(HP)\s0, +Wayne Scott \s-1(BitMover)\s0, +John Wilkes \s-1(HP)\s0, +and +Mitch Wright \s-1(HP)\s0. +.LP +We would also like to thank all of the people that have run the +benchmark and contributed their results; none of this would have been possible +without their assistance. +.LP +Our thanks to +all of the free software community for tools that were used during this +project. +.\" .R1 +.\" bibliography references-lmbench3 +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. 
\" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.\" References +.[ +$LIST$ +.] 
+.\" .so bios diff --git a/performance/lmbench3/doc/lmbench3_arch.fig b/performance/lmbench3/doc/lmbench3_arch.fig new file mode 100644 index 0000000..36274db --- /dev/null +++ b/performance/lmbench3/doc/lmbench3_arch.fig @@ -0,0 +1,119 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 900 1425 2100 2400 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 1950 2100 1950 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 2025 2100 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1350 1950 1350 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 1950 1500 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1650 1950 1650 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1800 1950 1800 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 1950 1950 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1200 1950 1200 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1050 1950 1050 2100 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 900 1425 2100 1425 2100 2400 900 2400 900 1425 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 2100 2100 2100 +4 0 0 50 0 0 12 0.0000 4 135 480 1275 1575 Cache\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 150 525 3450 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 300 750 300 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 600 750 600 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 900 750 900 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 150 2625 2250 2625 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 3075 75 3450 75 3450 300 3075 300 3075 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2550 300 2550 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2400 75 2775 75 2775 300 2400 300 2400 75 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1950 75 2325 75 2325 300 1950 300 1950 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 3225 300 3225 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2100 300 2100 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 75 825 75 825 300 225 300 225 75 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 975 75 1575 75 1575 300 975 300 975 75 +2 1 0 1 0 7 
50 0 -1 0.000 0 0 -1 0 0 2 + 1275 300 1275 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 525 300 525 525 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 600 2775 1800 2775 1800 3450 600 3450 600 2775 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1125 2625 1125 2775 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2775 525 2775 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 750 1950 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 + 2775 975 2775 1275 1500 1275 1500 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2925 1350 3375 1350 3375 1575 2925 1575 2925 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3000 900 3150 900 3150 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3150 1575 3150 1725 2100 1725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 898 1940 675 1875 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 + 2175 1950 2250 1950 2250 2100 2175 2100 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2475 2025 2250 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 2400 1500 2625 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2625 750 3000 750 3000 975 2625 975 2625 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1875 750 2250 750 2250 975 1875 975 1875 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1125 750 1500 750 1500 975 1125 975 1125 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 675 750 1050 750 1050 975 675 975 675 750 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 750 600 750 600 975 225 975 225 750 +4 0 0 50 0 0 12 0.0000 4 135 375 300 225 ALU\001 +4 0 0 50 0 0 12 0.0000 4 135 270 75 450 bus\001 +4 0 0 50 0 0 12 0.0000 4 135 150 3150 225 fn\001 +4 0 0 50 0 0 12 0.0000 4 15 135 2850 225 ...\001 +4 0 0 50 0 0 12 0.0000 4 180 1710 1725 450 floating point registers\001 +4 0 0 50 0 0 12 0.0000 4 135 150 2475 225 f1\001 +4 0 0 50 0 0 12 0.0000 4 135 150 2025 225 f0\001 +4 0 0 50 0 0 12 0.0000 4 135 345 1050 225 FPU\001 +4 0 0 50 0 0 12 0.0000 4 135 600 900 2925 memory\001 +4 0 0 50 0 0 12 0.0000 4 135 300 2700 900 MA\001 +4 0 0 50 0 0 12 0.0000 4 180 1500 1350 1275 physical addressing\001 +4 0 0 50 0 0 12 
0.0000 4 135 765 150 1875 cache line\001 +4 0 0 50 0 0 12 0.0000 4 180 900 2550 2025 set (2-way)\001 +4 0 0 50 0 0 12 0.0000 4 135 330 3000 1500 TLB\001 +4 0 0 50 0 0 12 0.0000 4 180 915 2325 2625 memory bus\001 +4 0 0 50 0 0 12 0.0000 4 90 150 1950 900 rn\001 +4 0 0 50 0 0 12 0.0000 4 15 135 1575 900 ...\001 +4 0 0 50 0 0 12 0.0000 4 135 150 1200 900 r2\001 +4 0 0 50 0 0 12 0.0000 4 135 150 750 900 r1\001 +4 0 0 50 0 0 12 0.0000 4 135 150 300 900 r0\001 +4 0 0 50 0 0 12 0.0000 4 180 1245 975 675 integer registers\001 diff --git a/performance/lmbench3/doc/lmbench3_signals.fig b/performance/lmbench3/doc/lmbench3_signals.fig new file mode 100644 index 0000000..12e9bb1 --- /dev/null +++ b/performance/lmbench3/doc/lmbench3_signals.fig @@ -0,0 +1,95 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 225 1575 1050 2025 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 225 1800 375 1800 375 1950 225 1950 225 1800 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 225 1575 375 1575 375 1725 225 1725 225 1575 +4 0 0 50 0 0 12 0.0000 4 180 600 450 1725 working\001 +4 0 0 50 0 0 12 0.0000 4 180 465 450 1950 timing\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2025 300 2025 1725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 2250 300 2250 1575 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 525 2250 525 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 825 2250 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1575 675 750 675 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 975 750 975 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 750 1125 2250 1125 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1800 300 1800 1875 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 600 2250 600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1575 675 3000 675 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 900 2250 900 +2 1 0 1 
0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 1800 1200 3000 1200 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 3000 1275 2250 1275 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2025 1350 3000 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1500 750 1650 600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1950 1425 2100 1275 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 1500 75 2325 75 2325 300 1500 300 1500 75 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 + 1575 300 1575 2025 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 1 1 1.00 60.00 120.00 + 2025 1350 750 1350 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 150 525 750 525 750 975 150 975 150 525 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 150 675 750 675 750 825 150 825 150 675 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 150 225 750 225 750 1350 150 1350 150 225 +2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 + 3000 600 3600 600 3600 1200 3000 1200 3000 600 +2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 + 3000 675 3600 675 3600 900 3000 900 3000 675 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 3000 225 3600 225 3600 1425 3000 1425 3000 225 +4 0 0 50 0 0 12 0.0000 4 150 480 1650 225 parent\001 +4 0 0 50 0 1 12 0.0000 4 180 420 900 525 ready\001 +4 0 0 50 0 1 12 0.0000 4 135 360 900 825 done\001 +4 0 0 50 0 1 12 0.0000 4 135 495 1200 975 results\001 +4 0 0 50 0 1 12 0.0000 4 180 1020 825 1125 timing results\001 +4 0 0 50 0 0 12 0.0000 4 135 450 3075 375 child1\001 +4 0 0 50 0 1 12 0.0000 4 135 495 2325 1200 results\001 +4 0 0 50 0 1 12 0.0000 4 180 420 2550 600 ready\001 +4 0 0 50 0 1 12 0.0000 4 135 360 2550 900 done\001 +4 0 0 50 0 1 12 0.0000 4 135 165 1350 675 go\001 +4 0 0 50 0 1 12 0.0000 4 135 300 1275 1350 exit\001 +4 0 0 50 0 0 12 0.0000 4 105 360 1650 2025 start\001 +4 0 0 50 0 0 12 0.0000 4 135 690 2325 1575 response\001 +4 0 0 50 0 0 12 0.0000 4 135 285 2100 1725 exit\001 +4 0 0 50 0 0 12 0.0000 4 135 435 1875 1875 result\001 +4 0 0 50 0 0 12 0.0000 4 135 450 225 375 child0\001 diff --git 
a/performance/lmbench3/doc/lmdd.8 b/performance/lmbench3/doc/lmdd.8 new file mode 100644 index 0000000..fdb888c --- /dev/null +++ b/performance/lmbench3/doc/lmdd.8 @@ -0,0 +1,146 @@ +.\" $Id: lmdd.8 1.1 94/11/18 01:26:35-08:00 lm@xxxxxxxxxxxxxxx $ +.TH LMDD 8 "$Date: 94/11/18 01:26:35-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" +.SH NAME +lmdd \- move io for performance and debugging tests +.SH SYNOPSIS +.B lmdd +[ +.IB option = value +] .\|.\|. +.SH DESCRIPTION +.B lmdd +copies a specified input file to a specified output with possible +conversions. This program is primarily useful for timing I/O since it +prints out the timing statistics after completing. +.SH OPTIONS +.TP 15 +.BI if= name +Input file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like Sun's +.IR /dev/zero , +i.e., it provides a buffer of zeros without doing a system call to get them. +.sp .5 +The following file names are taken to mean the standard input: +.IR - , +.IR 0 , +or +.IR stdin . +.TP +.BI of= name +Output file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like +.IR /dev/null , +without doing a system call to get rid of the data. +.sp .5 +The following file names are taken to mean the standard output: +.IR - , +.IR 1 , +or +.IR stdout . +.sp .5 +The following file names are taken to mean the standard error: +.IR 2 , +or +.IR stderr . +.TP +.BI bs= n +Input and output block size +.I n +bytes (default 8192). Note that this is different from dd(1), it has +a 512 byte default. Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +respectively. +.TP +.BI ipat= n +If +.B n +is non zero, expect a known pattern in the file (see opat). Mismatches +will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is +a sequence of 4 byte integers with the first 0, second 1, and so on. +The default is not to check for the pattern. 
+.TP +.BI opat= n +If +.B n +is non zero, generate a known pattern on the output stream. Used for +debugging file system correctness. +The default is not to generate the pattern. +.TP +.BI mismatch= n +If +.B n +is non zero, stop at the first mismatched value. Used with ipat. +.TP +.BI skip= n +Skip +.IR n "" +input blocks before starting copy. +.TP +.BI fsync= n +If +.I n +is non-zero, call fsync(2) on the output file before exiting or printing +timing statistics. +.TP +.BI sync= n +If +.I n +is non-zero, call sync(2) before exiting or printing +timing statistics. +.TP +.BI rand= n +This argument, by default off, turns on random behavior. The argument is +not a flag, it is a size, that size is used as the upper bound for the +seeks. +Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +.TP +.BI flush= n +If +.I n +is non-zero and mmap(2) is available, call msync(2) to invalidate the +output file. This flushes the file to disk so that you don't have +unmount/mount. It is not as good as mount/unmount because it just +flushes file pages - it misses the indirect blocks which are still +cached. Not supported on all systems, compile time option. +.TP +.BI rusage= n +If +.I n +is non-zero, print rusage statistics as well as timing statistics. +Not supported on all systems, compile time option. +.TP +.BI count= n +Copy only +.IR n "" +input records. +.SH EXAMPLES +.LP +This is the most common usage, the intent is to measure disk performance. +The disk is a spare partition mounted on /spare. 
+.sp +.nf +.in +4 +# mount /spare +# lmdd if=internal of=/spare/XXX count=1000 fsync=1 +7.81 MB in 3.78 seconds (2.0676 MB/sec) + +: Flush cache +# umount /spare +# mount /spare + +# lmdd if=/spare/XXX of=internal +7.81 MB in 2.83 seconds (2.7611 MB/sec) +.in +.sp +.fi +.SH AUTHOR +Larry McVoy, lm@xxxxxxx diff --git a/performance/lmbench3/doc/mem.pic b/performance/lmbench3/doc/mem.pic new file mode 100644 index 0000000..a8b5971 --- /dev/null +++ b/performance/lmbench3/doc/mem.pic @@ -0,0 +1,2337 @@ +.PS +.ps 8 +.vs 11 +.ft CB +[ +# Variables, tweak these. + xtick = 2.000000 # width of an X tick + xlower = 8.000000 # where the xtick start + xupper = 24.000000 # upper range of graph + xn = 8 # number of ticks to do + ytick = 50.000000 # width of an Y tick + ylower = 0.000000 # where the ytick start + yupper = 500.000000 # upper range of graph + yn = 10 # number of ticks to do + xsize = 1.75 # width of the graph + ysize = 1.75 # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = 0.10000000000000000555 # distance towards numbers + gthk = .1 # thickness of grid lines + thk = .75 # thickness of data lines + qthk = 2.0 # thickness of quartile lines + vs = .15 # works for 10 point fonts + +# Draw the graph borders and tick marks + O: box thick 1.5 ht ysize wid xsize + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) + + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) + } + if (yupper - ylower > 999) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) + } else { + sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) + }}} + j = j + 
ytick + } + j = xlower + for i = 0 to xn by 1 do { + xs = j - xlower + g = xs * xscale + line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) + + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) + } + if (xupper - xlower > 999) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + (g, -.25) + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + (g, -.25) + } else { + sprintf("%.2f", j) at O.sw + (g, -.25) + }}} + j = j + xtick + } + +# DATASET: stride=8, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (18 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (23 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box 
invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (29 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (30 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (30 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (17 - xlower), yscale * (31 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (33 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (32 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (33 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (34 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (35 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (35 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ 
box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (37 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (46 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (89 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (89 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (90 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (92 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (92 - ylower)) + +# DATASET: stride=16, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (26 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (36 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (44 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), 
yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (52 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (18.32192809488736529 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (59 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (65 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (164 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht 
.05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (166 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (165 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (168 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (166 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (167 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (167 - ylower)) + +# DATASET: stride=32, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - 
xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (56 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(17.906921372774437629 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (65 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (64 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (68 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (70 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (83 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (85 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (87 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (21.321928094887361738 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (339 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (337 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (338 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (337 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (335 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (338 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (339 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (336 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (340 - ylower)) + +# DATASET: stride=64, MARK 3 +[ "\(mu" ] at O.sw + \ + (xscale * 
(9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (49 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 
wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (55 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (51 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - 
xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (57 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (58 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (62 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (63 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (76 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (79 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + 
\ + (xscale * (20 - xlower), yscale * (323 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (325 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ 
box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(mu" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (327 - ylower)) + +# DATASET: stride=128, MARK 4 +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (14.000461588562853166 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last 
[].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 
- ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (59 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (60 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (61 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ 
+ (xscale * (18.807354922057605506 - xlower), yscale * (71 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (75 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (75 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (317 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (319 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 
] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (323 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(bu\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (321 - ylower)) + +# DATASET: stride=512, MARK 5 +[ box ht .07 wid .07 fill 1 ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(13.322574277531574083 - xlower), yscale * (28 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), 
yscale * (50 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (18.459431618637296424 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (67 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (77 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (74 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (80 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (317 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at 
O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (320 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (321 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (322 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box ht .07 wid .07 fill 1 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (322 - ylower)) + +# DATASET: stride=1024, MARK 6 +[ "\s+2\(pl\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 
2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), 
yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 
] at O.sw + \ + (xscale * (18 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (78 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (88 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (91 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (324 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (325 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 
wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (328 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (327 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (326 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+2\(pl\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (326 - ylower)) + +# DATASET: stride=2048, MARK 7 +[ "\s+4\(**\s0" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (27 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (40 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (48 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 
2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), 
yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] 
at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (100 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (111 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (115 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (114 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (340 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (343 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (344 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (345 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (345 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\s+4\(**\s0" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (345 - ylower)) + +# DATASET: stride=4096, MARK 0 +[ "\(ci" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to 
last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), 
yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (56 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (147 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (146 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (146 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (145 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (157 - ylower)) +line thick thk from 2nd 
last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (162 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (160 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (379 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (380 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (378 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (380 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (381 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (382 - ylower)) +line 
thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (382 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (383 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (383 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (385 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(ci" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (385 - ylower)) + +# DATASET: stride=8192, MARK 1 +[ "\(sq" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box 
invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + 
(xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (231 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (231 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c 
+[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (232 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (230 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (240 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (246 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (246 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (441 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (451 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (443 - ylower)) +line thick thk from 2nd last 
[].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (441 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (442 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (446 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (453 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (453 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(sq" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (453 - ylower)) + +# DATASET: stride=16384, MARK 2 +[ "\(*D" ] at O.sw + \ + (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] 
at O.sw + \ + (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.322574277531574083 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.585116379985436197 - xlower), yscale * (11 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (13.807157053169248684 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.000461588562853166 - xlower), yscale * (10 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (14.907044474872799711 - xlower), yscale * (54 - 
ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.322020424415466522 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * 
(16.584962500721157852 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.700404205210695352 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.700475230197337595 - xlower), yscale * (53 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis 
ht .05 wid .05 ] at O.sw + \ + (xscale * (18.169925001442312151 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.32192809488736529 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.459431618637296424 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.584962500721157852 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.70043971814109085 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.807354922057605506 - xlower), yscale * (55 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (18.90689059560851959 - xlower), yscale * (72 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (19 - xlower), yscale * (243 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20 - xlower), yscale * (432 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (20.584962500721154299 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21 - xlower), yscale * (445 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.321928094887361738 - xlower), yscale * (447 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.584962500721154299 - xlower), yscale * (448 - ylower)) +line thick thk from 2nd last [].c 
to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (21.807354922057605506 - xlower), yscale * (448 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.169925001442312151 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.321928094887361738 - xlower), yscale * (447 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.459431618637296424 - xlower), yscale * (450 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.584962500721157852 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.70043971814109085 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.807354922057605506 - xlower), yscale * (449 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (22.90689059560851959 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ box invis ht .05 wid .05 ] at O.sw + \ + (xscale * (23 - xlower), yscale * (452 - ylower)) +line thick thk from 2nd last [].c to last [].c +[ "\(*D" ] at O.sw + \ + (xscale * (23 - xlower), yscale * (452 - ylower)) + +# DATASET: stride=16384, MARK 3 + +# DATASET: stride=16384, MARK 4 + +.ps 8 +.vs 8 +"8KB" "cache" at O.sw + .35,.32 +arrow thick 2 wid .07 down .15 from O.sw + .35,.20 +".5MB" "cache" at O.sw + .85,.50 +arrow thick 2 wid .07 down .15 from O.sw + .85,.38 +"Main" "mem" at O.e - .25,.15 +arrow thick 2 wid .07 up .15 from O.e - .25,0 +.vs 
+.ps + +# Xaxis title. +"\s+2log2(Array size)\s0" rjust at O.se - (0, .6) + +# Yaxis title (Latency in nanoseconds) +.ps +2 +.vs -1 +"L" "a" "t" "e" "n" "c" "y" " " "i" "n" at O.w - (.95, 0) +"n" "a" "n" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.75, 0) +.ps +.vs + +# Graph title. +"\s+2DEC alpha@182mhz memory latencies\s0" at O.n + (-.5, .3) + +# Title. +#[ "\(ci" ] at O.ne + (.25, - 0 * vs) +#"stride=8" ljust at last [].e + (.1, 0) +#[ "\(sq" ] at O.ne + (.25, - 1 * vs) +#"stride=16" ljust at last [].e + (.1, 0) +#[ "\(*D" ] at O.ne + (.25, - 2 * vs) +#"stride=32" ljust at last [].e + (.1, 0) +#[ "\(mu" ] at O.ne + (.25, - 3 * vs) +#"stride=64" ljust at last [].e + (.1, 0) +#[ "\s+4\(bu\s0" ] at O.ne + (.25, - 4 * vs) +#"stride=128" ljust at last [].e + (.1, 0) +#[ box ht .07 wid .07 fill 1 ] at O.ne + (.25, - 5 * vs) +#"stride=512" ljust at last [].e + (.1, 0) +#[ "\s+2\(pl\s0" ] at O.ne + (.25, - 6 * vs) +#"stride=1024" ljust at last [].e + (.1, 0) +#[ "\s+4\(**\s0" ] at O.ne + (.25, - 7 * vs) +#"stride=2048" ljust at last [].e + (.1, 0) +#[ "\(ci" ] at O.ne + (.25, - 8 * vs) +#"stride=4096" ljust at last [].e + (.1, 0) +#[ "\(sq" ] at O.ne + (.25, - 9 * vs) +#"stride=8192" ljust at last [].e + (.1, 0) +#[ "\(*D" ] at O.ne + (.25, - 10 * vs) +#"stride=16384" ljust at last [].e + (.1, 0) +] +.ft +.ps +.PE diff --git a/performance/lmbench3/doc/memhier-color.d b/performance/lmbench3/doc/memhier-color.d new file mode 100644 index 0000000..50a3cef --- /dev/null +++ b/performance/lmbench3/doc/memhier-color.d @@ -0,0 +1,86 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Size (MB)" +label left "Latency (ns)" +coord log log +ticks bottom out at 0.000512 "512", 0.001024 "", 0.002048 "", 0.004096 "", 0.008192 "8K", 0.016384 "", 0.032768 "", 0.065536 "", 0.098304 "96K", 0.131072 "", 0.262144 "", 0.524288 "", 1.048576 "1M", 2.097152 "", 4.194304 "", 8.388608 "", 16.777216 "", 33.554432 "32M" +draw dotted + 
0.000512 4.042 + 0.008192 4.046 + 0.010240 8.873 + 0.012288 12.085 + 0.016384 16.097 + 0.032768 16.103 + 0.065536 19.908 + 0.098304 20.622 + 0.114688 29.808 + 0.131072 37.724 + 0.196608 47.561 + 0.262144 52.134 + 0.524288 66.410 + 1.048576 74.897 + 1.310720 153.075 + 1.572864 198.678 + 2.097152 264.935 + 3.145728 333.862 + 4.194304 366.109 + 8.388608 370.522 + 33.554432 370.682 +"Colored" ljust at 1.572864, 222.789 +draw solid + 0.000512 4.042 + 0.000640 4.043 + 0.000768 4.044 + 0.000896 4.043 + 0.001024 4.043 + 0.001280 4.044 + 0.001536 4.044 + 0.001792 4.044 + 0.002048 4.041 + 0.002560 4.044 + 0.003072 4.045 + 0.003584 4.044 + 0.004096 4.045 + 0.005120 4.046 + 0.006144 4.047 + 0.007168 4.048 + 0.008192 4.048 + 0.010240 8.872 + 0.012288 12.079 + 0.014336 14.379 + 0.016384 16.097 + 0.020480 16.104 + 0.024576 16.117 + 0.028672 16.114 + 0.032768 16.106 + 0.040960 16.110 + 0.049152 16.123 + 0.057344 18.062 + 0.065536 19.179 + 0.081920 97.039 + 0.098304 84.011 + 0.114688 81.764 + 0.131072 79.122 + 0.163840 82.634 + 0.196608 108.550 + 0.229376 104.530 + 0.262144 119.771 + 0.327680 111.317 + 0.393216 131.057 + 0.458752 143.902 + 0.524288 173.323 + 0.655360 197.268 + 0.786432 219.736 + 0.917504 224.743 + 1.048576 249.878 + 1.310720 287.157 + 1.572864 302.857 + 1.835008 315.170 + 2.097152 329.874 + 2.621440 347.418 + 3.145728 357.183 + 3.670016 362.297 + 4.194304 365.720 + 5.242880 369.345 + 33.554432 370.296 +"Malloc'ed" rjust at 0.458752, 219.736 diff --git a/performance/lmbench3/doc/memhier-line.d b/performance/lmbench3/doc/memhier-line.d new file mode 100644 index 0000000..4bb890e --- /dev/null +++ b/performance/lmbench3/doc/memhier-line.d @@ -0,0 +1,34 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Line Size (Bytes)" +label left "Latency (ns)" +coord log log +ticks bottom out from 8 to 512 by *4 +ticks bottom out from 8 to 512 by *2 "" +draw solid +8 7.247 +16 10.909 +32 16.788 +64 17.083 +128 16.272 
+256 16.721 +512 16.129 +"L1" rjust above at 512, 16.129 +draw solid +8 22.853 +16 41.496 +32 78.712 +64 141.658 +128 139.119 +256 138.446 +512 137.902 +"L2" rjust above at 512, 137.902 +draw solid +8 51.529 +16 98.915 +32 193.614 +64 372.230 +128 371.689 +256 371.486 +512 371.486 +"L3" rjust above at 512, 371.486 diff --git a/performance/lmbench3/doc/memhier-tlb.d b/performance/lmbench3/doc/memhier-tlb.d new file mode 100644 index 0000000..908e840 --- /dev/null +++ b/performance/lmbench3/doc/memhier-tlb.d @@ -0,0 +1,407 @@ +frame invis ht 1.5 wid 2.5 left solid bot solid +label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 +label bot "Pages" +label left "Latency (ns)" +coord log log +draw dotted +1 4.042 +2 4.047 +3 4.043 +4 4.044 +5 4.043 +6 4.043 +7 4.045 +8 4.044 +9 4.044 +10 4.044 +11 4.044 +12 4.044 +13 4.044 +14 4.044 +15 4.045 +16 4.046 +17 4.047 +18 4.046 +19 4.046 +20 4.047 +21 4.048 +22 4.046 +23 4.047 +24 4.047 +25 4.048 +26 4.048 +27 4.048 +28 4.048 +29 4.048 +30 4.049 +31 4.048 +32 4.049 +33 4.049 +34 4.049 +35 4.049 +36 4.049 +37 4.049 +38 4.049 +39 4.071 +40 4.070 +41 4.070 +42 4.070 +43 4.070 +44 4.070 +45 4.070 +46 4.069 +47 4.070 +48 4.070 +49 4.071 +50 4.070 +51 4.070 +52 4.069 +53 4.048 +54 4.049 +55 4.069 +56 4.049 +57 4.049 +58 4.070 +59 4.048 +60 4.050 +61 4.070 +62 4.050 +63 4.048 +64 4.066 +65 4.048 +66 4.050 +67 4.069 +68 4.048 +69 4.049 +70 4.069 +71 4.049 +72 4.049 +73 4.069 +74 4.071 +75 4.071 +76 4.069 +77 4.071 +78 4.071 +79 4.069 +80 4.069 +81 4.069 +82 4.069 +83 4.069 +84 4.070 +85 4.070 +86 4.069 +87 4.070 +88 4.070 +89 4.071 +90 4.071 +91 4.070 +92 4.070 +93 4.072 +94 4.070 +95 4.049 +96 4.049 +97 4.070 +98 4.049 +99 4.050 +100 4.071 +101 4.050 +102 4.048 +103 4.049 +104 4.048 +105 4.048 +106 4.048 +107 4.049 +108 4.048 +109 4.048 +110 4.048 +111 4.048 +112 4.048 +113 4.051 +114 4.048 +115 4.069 +116 4.050 +117 4.048 +118 4.048 +119 4.048 +120 4.054 +121 4.054 +122 4.048 +123 4.050 +124 4.049 +125 4.048 +126 4.049 +127 4.048 +128 
4.049 +129 4.260 +130 4.446 +131 4.647 +132 4.802 +133 4.978 +134 5.148 +135 5.321 +136 5.490 +137 5.653 +138 5.816 +139 5.980 +140 6.138 +141 7.370 +256 7.068 +"Packed" rjust above at 246, 7.370 +draw solid +1 4.042 +2 4.042 +3 4.042 +4 4.042 +5 4.042 +6 4.043 +7 4.042 +8 4.042 +9 4.042 +10 4.042 +11 4.042 +12 4.042 +13 4.043 +14 4.042 +15 4.041 +16 4.042 +17 4.043 +18 4.042 +19 4.042 +20 4.043 +21 4.043 +22 4.046 +23 4.044 +24 4.043 +25 4.043 +26 4.044 +27 4.042 +28 4.041 +29 4.044 +30 4.043 +31 4.044 +32 4.044 +33 4.044 +34 4.044 +35 4.044 +36 4.045 +37 4.044 +38 4.044 +39 4.044 +40 4.042 +41 4.043 +42 4.042 +43 4.044 +44 4.044 +45 4.044 +46 4.045 +47 4.044 +48 4.051 +49 4.044 +50 4.044 +51 4.043 +52 4.042 +53 4.045 +54 4.044 +55 4.042 +56 4.044 +57 4.049 +58 4.046 +59 4.045 +60 4.045 +61 4.045 +62 4.047 +63 4.045 +64 39.263 +65 39.209 +66 39.163 +67 39.488 +68 39.473 +69 39.752 +70 39.710 +71 39.651 +72 39.605 +73 39.606 +74 39.522 +75 47.264 +76 39.490 +77 40.007 +78 39.945 +79 39.900 +80 39.891 +81 47.525 +82 39.819 +83 40.051 +84 39.993 +85 40.556 +86 40.487 +87 40.470 +88 40.396 +89 40.623 +90 40.565 +91 40.497 +92 41.640 +93 53.333 +94 40.866 +95 40.823 +96 46.649 +97 40.723 +98 40.739 +99 40.896 +100 40.826 +101 41.257 +102 41.462 +103 41.192 +104 41.150 +105 41.309 +106 41.267 +107 41.471 +108 46.722 +109 41.819 +110 41.742 +111 46.823 +112 41.691 +113 41.592 +114 41.554 +115 41.736 +116 41.712 +117 46.795 +118 43.811 +119 41.940 +120 52.439 +121 42.053 +122 42.025 +123 43.049 +124 42.302 +125 42.431 +126 42.403 +127 42.346 +128 42.496 +129 43.304 +130 42.394 +131 42.591 +132 43.344 +133 46.852 +134 43.398 +135 47.048 +136 43.622 +137 46.991 +138 42.750 +139 42.892 +140 43.915 +141 47.368 +142 52.607 +143 46.635 +144 43.154 +145 43.198 +146 43.866 +147 43.205 +148 47.229 +149 44.179 +150 47.845 +151 44.228 +152 45.044 +153 47.489 +154 44.559 +155 52.694 +156 44.713 +157 48.325 +158 43.963 +159 47.580 +160 53.114 +161 48.816 +162 48.765 +163 46.131 +164 
49.539 +165 51.761 +166 48.149 +167 49.600 +168 44.871 +169 49.938 +170 47.790 +171 47.698 +172 48.453 +173 45.148 +174 55.011 +175 45.250 +176 45.917 +177 51.219 +178 48.819 +179 45.335 +180 48.083 +181 58.405 +182 48.727 +183 46.855 +184 46.712 +185 54.348 +186 46.814 +187 48.785 +188 49.653 +189 51.982 +190 51.728 +191 46.027 +192 52.139 +193 53.446 +194 46.605 +195 52.417 +196 52.008 +197 47.167 +198 50.892 +199 54.935 +200 46.870 +201 48.752 +202 46.438 +203 50.100 +204 48.546 +205 49.406 +206 48.250 +207 48.192 +208 49.371 +209 50.398 +210 52.615 +211 49.973 +212 58.927 +213 51.122 +214 47.716 +215 51.216 +216 53.270 +217 49.865 +218 50.324 +219 49.916 +220 49.336 +221 56.814 +222 50.417 +223 50.910 +224 55.038 +225 61.760 +226 53.135 +227 53.262 +228 50.561 +229 48.315 +230 49.193 +231 53.704 +232 53.386 +233 61.107 +234 49.641 +235 49.387 +236 51.842 +237 52.700 +238 49.340 +239 52.748 +240 57.290 +241 49.655 +242 50.643 +243 52.568 +244 52.457 +245 54.264 +246 59.484 +247 52.176 +248 52.697 +249 63.909 +250 56.820 +251 52.252 +252 62.305 +253 51.512 +254 54.730 +255 51.264 +256 52.391 +"Word/Page" rjust at 80, 52.391 diff --git a/performance/lmbench3/doc/memhier.ms b/performance/lmbench3/doc/memhier.ms new file mode 100644 index 0000000..cd81c2b --- /dev/null +++ b/performance/lmbench3/doc/memhier.ms @@ -0,0 +1,1576 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. 
nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2001 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. 
+.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Micro-architecture analysis +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lmdd \f(CWlmdd\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] ", " +.\" .R2 +.EQ +delim $$ +.EN +.TL +\s(14Micro-architecture analysis\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench] version 3 includes a number of new micro-benchmarks +that analyze specific aspects of system micro-architecture, +such as instruction level parallelism, the cache hierarchy and TLB. +.LP +There are new benchmarks to measure instruction level +parallelism, such as the effectiveness of overlapped +memory accesses or arithmetic operations. +There are other new benchmarks to measure various +aspects of the architecture, such as the cache line +size(s), TLB size, and latency costs for basic +arithmetic operations. 
+\*[lmbench] can identify the number of caches, and the size, +line size, and available parallelism for each cache. +It can also measure the effective TLB size. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] version 3 includes a variety of new benchmarks +designed to measure and analyze various aspects of memory +system design and performance. The most important aspect +of memory subsystem performance is typically the memory +hierarchy, the number and size of caches. Other important +aspects include the cache line size, TLB, and memory +parallelism. +.LP +There are any number of aspects of a computer's +micro-architecture that can impact a program's +performance, such as the design of the memory +hierarchy and the basic performance of the various +arithmetic units. +.LP +All of the new benchmarks were added to \*[lmbench] +because the author needed them to help guide his +design decisions in one or more projects over the +last few years. +For example, \*[lat_ops] was added because the +author was trying to decide whether a particular +image processing algorithm should be implemented +using integer or floating point arithmetic. +Floating point arithmetic was preferred for a +variety of reasons, but it was feared that +floating point arithmetic would be prohibitively +expensive compared to integer operations. +By quickly building \*[lat_ops] the author was +able to verify that the floating point performance +should be no worse than integer performance. +.LP +Memory speeds have not kept pace with the dizzying pace +of processor performance improvements. The result has +been a steady increase in the relative cost of memory +accesses, when measured in terms of instructions or +clock ticks. For example, a 2GHz processor with 200ns +memory latency would wait roughly 400 instructions for +a single memory access. +.LP +To alleviate memory bottlenecks, architects use cache +memory to reduce the average memory latency. 
Typically +there are between one and three caches in modern +memory subsystems. A rule of thumb is that each +step down the memory hierarchy results in at least +a doubling of memory latency and at least a doubling +of the cache size. +.LP +The details of the memory hierarchy design can have +a significant impact on application performance +.RN Whaley98 , +but unfortunately developers frequently cannot predict +the exact configuration of machines which will run +their software. Additionally, many developers are +even unaware of the architectural details of their +own machines. +.LP +One hope is that by providing a portable ANSI-C +tool, developers may be better informed about the +architectural possibilities provided by their +own machines, and they may develop more efficient +software which can automatically utilize features +of the particular hardware based on information +provided by these utilities. +.LP +For example, +.RN Staelin02c +proposes variations on familiar data structures +which take advantage of the increased memory +parallelism afforded by modern processors to +increase performance as much as 50%. +.LP +Before explaining the various algorithms and +experimental methods for determining various +aspects of the memory hierarchy design, we +first give a short tutorial on memory system +design. Then we describe the basic techniques +used in analyzing the memory hierarchy, and +how they neutralize or measure various +subsystems or features of the memory system. +Finally, we describe in more detail the +specific algorithms used to measure the various +aspects of the memory subsystem. +.NH 1 +Computer Architecture Primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches.
+.LP +Of necessity, this primer over-simplifies the +many details and variations of specific computer +designs and architectures. For more information, +please see +.RN Hennessy96 . +.TSTART 1 +.so lmbench3_arch.pic +.FEND "Architecture diagram" 1 +.LP +Figure \n[FIGURE] contains a greatly simplified block diagram +of a computer. Various important elements, such as +the I/O bus and devices, have been left out. The +core of the processor are the registers (r0, ..., rn +and f0, ..., fn) and the arithmetic units (ALU and FPU). +In general, the arithmetic units can access data in +registers ''instantly''. Often data must be explicitly +loaded from memory into a register before it can be +manipulated by the arithmetic units. +.LP +The ALU handles integer arithmetic, such as bit +operations (AND, OR, XOR, NOT, and SHIFT) as +well as ADD, MUL, DIV, and MOD. Sometimes there +is specialized hardware to handle one or more +operations, such as a barrel shifter for SHIFT +or a multiplier, and sometimes there is no +hardware support for certain operations, such +as MUL, DIV, and MOD. +.LP +The FPU handles floating point arithmetic. +Sometimes there are separate FPUs for single +and double precision floating point operations. +.NH 2 +Memory hierarchy +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip.
In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced. The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size.\** +.FS +See +.RN Hennessy96 +page 396. +.FE +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. +A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time.
+This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. +.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 2 +Some Recent Innovations +.LP +There are a number of modern extensions to computer +architecture that attempt to increase the processor's +ability to do several things at once. Nearly all of +these enhancements are intended to be invisible to +programmers using higher-level languages such as +C or JAVA. 
+.IP "\fBSuperscalar processors\fR" +Superscalar processors have multiple processing +units which can operate simultaneously. +.IP "\fBDynamic instruction reordering\fR" +Dynamic instruction reordering allows the processor +to execute instructions whose operands are ready +before instructions which are stalled waiting for +memory or other instruction's completion. +.IP "\fBMemory parallelism\fR" +By allowing multiple outstanding memory requests, +processors allow the memory subsystem to service +multiple (independent) requests in parallel. +Since memory accesses are a common performance +bottleneck, this can greatly improve performance. +.IP "\fBVector processing\fR" +Vector processing allows the processor to execute +arithmetic operations on vector operands in +parallel, and in modern commodity processors goes +by names such as MMX, SSE, and 3DNow. +.IP "\fBSimultaneous multi-threading (SMT)\fR" +SMT allows superscalar processors to simultaneously +execute instructions from several threads (contexts) +.RN Tullsen96 . +SMT may include extensions which allow for very +lightweight inter-thread synchronization primitives +that enable much finer-grained thread-level +parallelism than traditional synchronization +methods +.RN Tullsen99 . +.IP "\fBExplicitly parallel instruction computers (EPIC)\fR" +EPIC allows the compiler to explicitly issue $N$ +instructions in parallel at each instruction, which +informs the hardware that these instructions are +independent and may be executed in parallel +.RN Schlansker00 . +It moves much of the burden regarding dependency +checking from the hardware to the compiler. +.NH 1 +Basic operation latency +.LP +\*[lmbench3] includes a new micro-benchmark +which measures the latency for a variety of basic +operations, such as addition, multiplication, and +division of integer, float, and double operands. +To measure the basic operation latency we construct +a basic arithmetic statement containing the desired +operands and operations.
This statement is repeated +one hundred times and these repetitions are then +embedded in a loop. +.TSTART +.TS +center box tab (&); +c c c +l & l & l . +Operand&Operation&Statement +_ +int&$bit$&r^=i;s^=r;r|=s; +&$add$&a+=b;b-=a; +&$mul$&r=(r*i)^r; +&$div$&r=(r/i)^r; +&$mod$&r=(r%i)^r; +_ +float&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +_ +double&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +.TE +.TEND "lat_ops statements" +.LP +Table \n[TABLE] shows the data type and expressions +used for each basic operation type. The variable +$i$ indicates the integer loop variable and generally +changes every ten or hundred evaluations of the +basic expression. All other variables are of +the basic type being measured, and aside from +being modified by the relevant expressions are +only initialized once at the beginning of the +benchmark routine. +.LP +Each statement has been designed to ensure that +the statement instances are \fIinterlocked\fR, +namely that the processor cannot begin processing +the next instance of the statement until it has +completed processing the previous instance. This +property is crucial to the correct measurement of +operation latency. +.LP +One important consideration in the design of +the statements was that they not be optimized +out of the loop by intelligent compilers. +Since the statements are repeated one hundred +times, the compiler has the option of evaluating +the sequence of one hundred repetitions of the +same statement, and sometimes it can find +optimizations that are not immediately +apparent. For example, the integer statement +$a=a+a;$ when repeated one hundred times in +a loop can be replaced with the single statement +$a=0;$ because the statement $a=a+a;$ is equivalent +to $a< < =1;$, and one hundred repetitions of that +statement is equivalent to $a< < =100;$, which for +32bit (or even 64bit) integers is equivalent to +$a=0;$. 
+.LP +It is relatively easy to identify floating +point statements that interlock, are not +optimized away, and that only use the operation +of interest. +It is much harder to identify integer statements +meeting the same criterion. All simple +integer bitwise operations can either be optimized +away, don't interlock, or use operations other +than one of interest. +We chose to add operations other than the +operation(s) of interest to the statements. +.LP +The integer $mul$, $div$, and $mod$ statements all +include an added $xor$ operation which prevents +(current) compilers from optimizing the statements +away. Since the $xor$ operation is generally +completed in a single clock tick, and since +we can measure the $xor$ operation latency +separately and subtract that overhead, we can +still measure the latencies of the other +operations of interest. +.LP +It is not possible to measure latency for 64bit +operations on 32bit machines because most +implementations allow operations on the upper +and lower bits to overlap. This means that +on most 32bit machines, the measured latency +would appear to be a non-integral multiple of +the basic clock cycle. For example, in the +$add$ statement, the system could first add +the two lower words. Then, in parallel it +could both add the two upper words (along with +the carry from the lower words), and compute +the $xor$ of the lower word. Finally, it +can overlap the $xor$ of the upper word +with the addition of the two lower words from +the next instantiation of the statement. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . 
+Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +mhz&&2.50&1.50&0.75 +int&$bit$&2.53&1.50&0.75 +&$add$&2.50&1.51&0.75 +&$mul$&14.52&6.07&3.03 +&$div$&109.40&58.52&30.86 +&$mod$&75.14&65.01&32.59 +_ +float&$add$&7.54&4.58&3.0 +&$mul$&7.50&7.50&3.0 +&$div$&45.00&35.26&13.21 +_ +double&$add$&7.52&4.53&3.01 +&$mul$&7.52&7.71&3.01 +&$div$&85.01&35.51&13.16 +.TE +.TEND "lat_ops results (ns)" +.LP +Table \n[TABLE] contains some sample results +for two processors. +It does contain one result which is slightly +surprising unless you are familiar with the +PA-RISC architecture: floating point multiply +and divide are faster than the corresponding +integer operations! This is because PA-RISC +does not contain integer MUL, DIV, or MOD +instructions and the optimizing compiler +converts the integers into floating point, +does the operations in the floating point +unit, and then converts the result back +to an integer. +.NH 2 +Basic operation parallelism +.LP +Instruction-level parallelism in commodity processors +has become commonplace in the last ten years. +Modern processors typically have more than one +operational unit that can be active during a +given clock cycle, such as an integer arithmetic +unit and a floating point unit. In addition, +processors may have more than a single instance +of a given type of operational unit, both of +which may be active at a given time. All this +intra-processor parallelism is used to try and +reduce the average number of clock cycles per +executed instruction. +.LP +\*[lmbench3] incorporates a new benchmark \*[par_ops] +which attempts to quantify the level of available +instruction-level parallelism provided by the processor. This +benchmark is very similar to \*[lat_ops], and +in fact uses the same statement kernels, but it +has been modified and extended. We create +different versions of each benchmark; each +version has $N$ sets of interleaved statements. +Each set is identical to equivalent \*[lat_ops] +statements. 
In this way multiple independent +sets can be executing the same operation(s) +in parallel, if the hardware supports it. +.LP +For example, the float $mul$ benchmark to measure +performance with two parallel streams of statements +would look something like this: +.DS L +\f(CW#define TEN(a) a a a a a a a a a a +void benchmark_1(iter_t iterations, void* cookie) +{ + register iter_t i = iterations; + struct _state* state = (struct _state*)cookie; + register float f0 = state->float_data[0]; + register float f1 = state->float_data[1]; + + while (i-- > 0) { + TEN(f0*=f0; f1*=f1;) + } + use_int((int)f0); + use_int((int)f1); +}\fP +.DE +.LP +If the processor had two floating point multiply +units, then both $f0$ and $f1$ multiplies could +proceed in parallel. +.LP +However, there are some potential problems with +the integer operations, namely the fact that the +statements contain mixed operations. In general, +processors have at least as many integer units +that can do $xor$ as can do the other operations +of interest ($mul$, $div$ and $mod$), so the +inclusion of $xor$ in the statements shouldn't +be a bottleneck. +.LP +However, since parallelism is measured by comparing +the latency of the single-stream with that of +multiple interleaved streams, and since the single-stream +latency includes the $xor$ latency, the apparent +parallelism of $mul$, $div$, $mod$ can be over-stated.
+For example, if a process has one unit that can +do integer bit operations, such as $xor$, and another +unit for integer $mul$ operations, then the average +latency for $a0 = (i * a0) ^ a0$ in the single stream +case would be: +.EQ +t bar = t sub xor + t sub mul +.EN +In the multi-stream case, the execution of the $xor$ +operation of one stream can be overlapped with the +$mul$ of another stream, so the average latency per +stream would simply be $t bar = t sub mul$, assuming +that $mul$ operations are not cheaper than $xor$ +operations, which results in an apparent parallelism +$p tilde$: +.EQ +p tilde = {t sub xor + t sub mul} over { t sub mul } +.EN +Assuming that $t sub xor < < t sub mul$, this +still gives a reasonable approximation to +the correct answer. Unfortunately, this is +not always a reasonable assumption. +.LP +Of course, if it was known ahead of time that +$xor$ and { $mul$, $div$, and $mod$ } used +different execution units, then the benchmark +could simply subtract $t sub xor$ from the +baseline measurement. The difficulty lies +in determining whether the units overlap +or not. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . +Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +int&$bit$&1.99&1.70&1.87 +&$add$&1.99&1.61&1.90 +&$mul$&6.64&3.81&2.00 +&$div$&2.81&1.20&1.00 +&$mod$&2.78&1.11&1.03 +_ +float&$add$&5.88&1.00&2.66 +&$mul$&5.86&1.14&2.47 +&$div$&2.12&1.03&1.14 +_ +double&$add$&5.68&1.08&2.49 +&$mul$&5.58&1.00&2.53 +&$div$&2.19&1.03&1.14 +.TE +.TEND "par_ops results" +.LP +.NH 1 +Memory analysis +.LP +There are a variety of aspects of memory hierarchy design +that are interesting to a software developer, such as +the number of caches and their sizes. In addition, other +aspects of cache design, such as the line size, +associativity and parallelism can impact software +performance and are of potential interest to software +developers. 
+.LP +The problem is designing a portable ANSI-C program to +infer the cache parameters. A number of operating +systems have hooks to report at least certain aspects +of cache and memory hierarchy design, but any program +utilizing those hooks would not be fully portable +across hardware and operating system platforms. +.LP +The key observation is that caches help reduce memory +latency. In a perfect world, all possible data would +fit in the cache, so a graph of average memory latency +versus amount of memory utilized would look like a +series of plateaus separated by cliffs. The cliff +edges would be located at the cache boundaries and +the plateau height would be the average memory latency. +.LP +The first problem is that one needs a mechanism for +accurately measuring time in a portable fashion. +\*[lmbench2] introduced a new timing harness +that determines the minimum duration of a timing interval +for \*[gettimeofday] to provide accurate measurements +.RN Staelin98 . +.LP +\*[lmbench] includes a benchmark that measures +average memory latency, \*[lat_mem_rd] +.RN McVoy96 . +It creates a pointer chain, and then measures the +average time to dereference the pointers. +\*[lat_mem_rd] creates the pointer chain by simply +striding through memory at fixed intervals, e.g. +every other word. +.LP +\*[lmbench2] extended \*[lat_mem_rd] so +that each timing interval only accessed memory +as many times as necessary to consume a timing +interval. When accessing cache this often means +that the whole pointer chain will be accessed +at least once during the timing interval, but +when accessing memory this often means that only +a portion of the chain will be accessed during +any given timing interval. +.LP +While this approach gives very useful insights +into memory hierarchy performance, it is not +quite sufficient to determine the various +characteristics of the memory hierarchy. 
+.LP +The first problem is that unless the stride is +exactly the same size as the cache line size, then +there will either be multiple successive accesses +to the same line, or some fraction of data +will be completely skipped. In the first case +the observed latency is much faster than the +true latency because it is the average of a +single miss latency (slow) with one or more +hit latencies (fast). In the second case, the +amount of data actually loaded into the cache +may be a small fraction of the expected amount +so the data may fit into a smaller (faster) +cache. +The second problem is that this sequence is +highly predictable, even by simple-minded +prefetching policies, so accurate prefetching +might be masking the true memory latencies. +.LP +This method does do a few things properly. +First of all, accesses to a single page are +clustered together so the TLB miss cost (if +any) is amortized over as many accesses as +possible. Secondly, assuming the pointer +chain is laid out unpredictably, the memory +subsystem must wait for the previous load +to complete before it can initiate the +next load, so we can measure the true latency. +.NH 2 +Prefetching +.LP +Some memory subsystems have been highly optimized to +recognize and automatically prefetch memory when +given "predictable" memory access streams, such as +when striding through array accesses. This means that +the memory access stream generated by \*[lmbench] +must be unpredictable by the standard prediction +algorithms. +.LP +The original \*[lmbench] memory latency benchmark, +lat_mem_rd, built a chain of pointers that would +stride backwards through memory. This was able to +defeat many simple prefetching algorithms of the +time, but some systems came to incorporate prefetching +algorithms that recognized strided accesses in +both directions. +.LP +The obvious method for producing an unpredictable +chain of line references is to use a random +permutation of line indexes. 
+.LP +\*[lmbench] uses a deterministic algorithm to compute +the reference chain which guarantees that references +are as far away from previous accesses in both time +and space as possible. Basically, the binary bits +representing the line index are reversed, so that +1101 becomes 1011, or 001 becomes 100. This only +works if the number of cache lines is an even power +of two, but since page sizes and line sizes are +always powers of two, this assumption is valid.\** +.FS +At least this is the case in every modern system known +to the author. +.FE +.LP +Additionally, since higher-level caches can have +smaller line sizes than lower-level caches, it +is necessary to access every word in the relevant +chunk of memory. However, accesses to words in +the same line must be separated in time by accesses +to the rest of the memory. This is achieved by +identifying the line size for the largest cache, +and then setting up the chain so that there is +one pass through the memory for each word in the +line with the sequence of words being determined +by the bit-reversal method described above. +.LP +For example, suppose a system has 4KB pages, the +largest cache has a line size of 64bytes, and a +word is 4bytes. Then each page would have 64 lines, +and each line would have 16 words. The system +would set up a pointer chain that visits each line +on each page using the zeroth word; at the end of +the chain it would then jump to the start of the +pages and visit each line on each page using the +eighth word, and so forth until each word had been +visited. +.NH 2 +Dirty data +.LP +An additional issue that we need to take into +account is the cache's policy for dirty data. +Many caches use a copy-back policy, while others +use a write-through policy. +.LP +Different caches on the same machine may use +different policies. Also, cache performance +can be affected by the presence of dirty data.
+For example, suppose both the L1 and L2 caches +use a copy-back policy, and suppose that the +access time for reading data located in L2 +depends on whether the data being ejected from +L1 is dirty and needs to be copied back from L1 +to L2 before the read from L2 to L1. +In this case, a benchmark which writes a pointer +chain that fits in L2 but is larger than L1, +and then measures the time to follow the chain, +will get a different average memory latency than +a benchmark which writes the same chain and +reads enough data to flush the L2 cache before +measuring the time to follow the chain. +In the first case, each application read will +result in a write from L1 to L2 followed by +a read from L2 to L1, while in the second +case each application read will only result +in a read from L2 to L1. +.LP +Since it is possible that average memory latencies +for a read-only access stream may be increased if +any of the data in the cache is dirty, we need to +flush the cache after setting up the pointer +chains and before we do any measurements. +Otherwise, when we access a pointer chain that +is larger than the L1 cache but smaller than the +largest cache, dirty data can reside in the lowest +(largest) cache and as each line is staged from +the largest cache to the L1 cache, it is marked +as dirty in the L1 cache. Then when each dirty +line is flushed from the L1 cache (to the L2 +cache), the system has to write the data back to +L2, which delays the load of the next (dirty) +line from L2 to L1. +.LP +To flush the cache we read (and sum) a large +amount of memory, which should be several times +larger than the largest cache. In this way, +all dirty data in the cache should be flushed +from the cache without creating additional +dirty data. +.NH 2 +Page mapping +.LP +Complicating the issue still further is the fact that +caches do not use full LRU replacement policies. 
Nearly +all caches use some form of set associativity, where +pages are directed to a pool of cache lines based on +the physical address. Replacement within the pool is +typically LRU. Direct-mapped caches are a special case +where the pool size is a single line. +.LP +Additionally, some systems use victim caches, which are +typically small caches which cache recently discarded +cache lines. Victim caches can be particularly effective +for direct-mapped caches by reducing the cache miss +rate caused by colliding hot spots. +.LP +However, page mapping and its attendant cache collisions +is under the control of the kernel, and is in fact +invisible to user-land programs. Some operating +systems make an effort to minimize possible page collisions +when giving memory to processes\**, while other operating +systems appear to simply grab the first available pages, +regardless of potential cache collision effects. +.FS +This is generally known as "page coloring", and is much +more important on systems with direct-mapped caches than +those with N-way set associative caches. +.FE +.LP +Factoring out page placement effects on average memory +latency is very difficult, but it is necessary to +ensure that the correct cache size is identified. +.NH 1 +Cache line size +.LP +The first feature of the memory hierarchy we +will try to analyze is the cache line size, +since we can find the line size for the +largest cache without any other knowledge of +the system, and since determining nearly all +other aspects of the memory subsystem either +require or are greatly simplified by knowing +the cache line size. +.LP +The most obvious aspect of cache design is that replacement +is done on a per-line basis, and cache lines often contain +several words of data (32-128bytes per line is common). +However, it is necessary to ensure that we don't +generate "spurious" cache hits by referencing a word from +a cache line that was recently accessed.
We must ensure
+that each line is only re-referenced after all other
+memory in the buffer has been referenced.
+.LP
+Unfortunately, we usually do not know the cache line size
+ahead of time.  In addition, sometimes systems contain
+several caches, and each cache can use a different line
+size!  Usually line sizes are powers of two, and usually
+the smaller (higher) caches have line sizes which are the
+same or smaller than the larger (lower) caches.  However,
+we still need to ensure that we access all cache lines
+for all caches without generating the spurious cache hits.
+.LP
+Determining the cache line size requires a series of
+experiments.  The basic observation is that when the
+amount of memory being accessed is larger than the
+cache, and when the access chain is arranged properly,
+then each memory reference causes a cache miss.  If
+however, a word on a recently accessed line is requested,
+then that reference will be a cache hit.  More
+completely, the average memory access time $t bar$
+is:
+.EQ
+t bar = t sub miss + ( n - 1 ) t sub hit
+.EN
+expressed as a function of $n$, the number of accesses
+to the cache line, $t sub miss$, the cache miss latency,
+and $t sub hit$, the cache hit latency.
+.TSTART
+.G1
+.so memhier-line.d
+.G2
+.FEND "Line Size"
+.LP
+We can determine the cache line size by measuring
+the average memory access latency over a series of
+memory access patterns: accessing every word, every
+other word, every fourth word, every eighth word, ...
+While the system is accessing multiple words per
+cache line, the average memory latency will be
+smaller than the cache miss latency, and as the
+space between accesses increases, the average
+memory latency will grow.
+When the system accesses only one word per line,
+the average memory latency will remain level even
+as the spacing between accesses increases.
+.LP
+It is possible to utilize this behavior to identify
+the cache line size.
The algorithm is to measure +the average memory latency when each word is +accessed. Then as you increase the space between +accessed words (doubling the space each iteration), +you look for a situation where the average latency +increased dramatically, say greater than 30%, +followed by a levelling off on the next iteration, +say an increase less than 15%. The line size is +the last point where the average latency jumped +dramatically. +.NH 1 +TLB +.LP +Measuring the TLB-miss costs assumes that one can isolate +those costs from the rest of the memory access costs. The +key observation is that it is often possible to create a +situation in which all data being accessed resides in the +cache, and yet it requires a TLB-miss to be able to locate +it. +.LP +This program identifies the effective TLB size, rather +than the true TLB size. First of all, from a programmer's +point of view, it is really the effective TLB size that +impacts program performance. Secondly, there is no way +for a user-land program to measure true TLB size because +kernels sometimes pin some kernel page mappings into the +TLB and because some hardware/OS combinations +support "super-pages", or multi-page mappings. +.LP +We create two similar pointer chains with identical length +and which reference an identical amount of memory, with one +key difference. In the first chain, the data is packed +tightly into as few pages as possible, and references +remain within a single page as long as possible. The +second chain spreads the data over as many pages as +possible and jumps between pages at each reference. +The two chains are arranged so that the same amount of +data will fit into the cache, so that the raw memory +access time for each chain is identical, within +experimental constraints. The sole difference between +average access costs should be the TLB-lookup times. +.LP +When the pages from the second chain fit into the TLB, +the average access times for the two chains should be +identical. 
However, as soon as the number of pages in +the second chain exceeds the TLB size, the second +chain will start to pay frequent TLB-miss costs. +Depending on the TLB replacement policy, the fraction of +requests generating TLB-misses in the second chain can vary +dramatically\**. +.FS +Pure LRU would ensure that as soon as the chain was one +page longer than the TLB size, every access would trigger +a TLB-miss. However, other replacement algorithms might +result in as few as $"number of pages" - "TLB size" + 1$ +misses per iteration over the loop. +.FE +.TSTART +.G1 +.so memhier-tlb.d +.G2 +.FEND "TLB" +.LP +The system must search for the point at which the +average memory latency of the second chain diverges +from the average latency of the first chain. Since +most systems have relatively small TLBs and since +checking TLB sizes smaller than the effective TLB +size is faster than checking TLB sizes larger than +the TLB, the system starts with the guess of eight +pages to establish a baseline. It then iteratively +doubles the number of pages until either a maximum +limit has been reached or the average TLB-miss cost +is greater than 15% of the average memory latency. +Once it discovers the upper bound on the possible +TLB size, it uses a binary search between the last +two TLB size guesses to find the point at which +the average latency for the two streams diverge. +.NH 1 +Cache size +.LP +For the purpose of identifying the cache size, the +ideal situation is that as long as the amount of +memory is equal to or less than the cache size, then +all the data is in the cache and the average memory +latency is the cache hit latency. 
As soon as the +memory doesn't fit in cache, then none of it should +be in the cache, so the average memory latency is +the cache miss latency.\** When examining average +memory latency versus memory size, this would give +nice flat plateaus for each cache, with nice sharp +transitions from one cache to the next, and from the +largest cache to main memory. +.FS +Of course, for real programs, you want the average +memory latency to be as low as possible, which means +that you want as much of the data in cache as possible. +.FE +.LP +However, the realities are that real data from real +systems is corrupted in a variety of ways. +First of all, even when the memory can fit into the +cache, pages often collide in the cache and the +fraction of pages that have collisions often +increases as the amount of memory nears the cache size. +Secondly, even when the memory cannot fit into the +cache, there can be pages that do not collide. +Finally, there is simple experimental noise, which is +usually limited to 1% or less. +.LP +The result of the first two problems is that on +some systems, the average memory latency increases +gradually as the memory size is increased. There +are no flat plateaus and sharp cliffs which make +it easy to identify the number, size, and +performance of the caches. +.NH 2 +Page coloring +.LP +The first problem is to create a set of pages +which do not collide in the cache. +The solution is to allocate more memory +than necessary, and to try different combinations +of pages to find the page set with the fastest +average memory latency. Unfortunately, the obvious +algorithm is exponential in the number of pages. +.TSTART +.G1 +.so memhier-color.d +.G2 +.FEND "Page Coloring Effects" +.LP +One observation is that cache misses are usually +much more expensive than cache hits. So, one +possibility is to choose a random set of pages +as the baseline and measure the average memory +latency. 
Then iterate over the pages, removing +that page from the set and measuring the average +memory latency of the reduced set. If that page +collides with another page, then the average +memory latency for the reduced set should be smaller +than the average latency for the whole set. +.LP +Once a page that collides has been identified, then +the system can iterate through available pages, +try adding them to the reduced set and measuring +the average memory latency. If the page doesn't +collide with any pages in the reduced set, then +the average memory latency should drop still further. +In this way, the system could identify all +colliding pages and replace them with pages +that don't collide (assuming the memory all +fits in the cache). +.LP +There are a number of problems with this simple approach. +First of all, it would take a very long time to run due +to the large, but polynomial, number of experiments required. +Secondly, as the memory size increases and the +number of pages involved gets large, the effect +of a single page on the average memory latency +can reach the level of experimental noise. +.LP +This approach makes the assumption that physical +page locations do not change once the memory +has been allocated. In most systems, this +assumption is valid unless the memory is paged +to disk. However, at least IRIX includes an +operating system configuration option to allow +the operating system to dynamically relocate +pages in memory. This capability is disabled +by default, so its use is relatively uncommon. +It is possible that page relocation will become +more common in the future, in which case this +design may need to be revisited in the future. +.LP +Our algorithm uses this basic approach, but +attempts to reduce the number of experiments +required by removing chunks of pages at a time. 
+It will remove up to 5% of pages at a time +and see if the average memory latency decreases +significantly, in which case it examines the +chunk a page at a time to find the page or +pages which probably conflict. +.LP +An additional problem is that for large caches, +the measured difference between two sets of +pages with just one page collision difference +can be very hard to measure. For example, +on a system with a 512Kbyte L2 cache and 4Kbyte +pages, the cache can hold 128 pages. Assuming +that a cache miss is 200ns, a cache hit is 50ns, +and 123 pages have no collisions but 5 pages +collide, then the average memory latency is +.EQ +t bar = { 123 times 50 + 5 times 200 } over 128 +.EN +or 55.85ns. Suppose we remove one page and +replace it with another page which doesn't +collide, so we now have 4 collisions and +124 pages without collisions, then the +average memory latency is 54.68ns. The +difference is generally significant even +in the face of experimental noise, but for +larger caches the differences may recede +into the background noise. +.LP +As caches increase in size, the problems +associated with detecting page collisions +can only increase. +For example, an 8MB cache on a system with +4KB pages would contain 2,048 pages. +Removing a single page collision, even when +the resulting memory latency for that page +reduces by a factor of four, would simply +result in an overall reduction in average +memory latency of less than 0.2%, which is +smaller than the average experimental measurement +errors. +.LP +Additionally, as caches increase in size, +effects such as cache consumption by the +page table can begin to become important. +.LP +The single largest remaining problem in our +system is that this algorithm does not +guarantee that we find a set of pages +which do not contain any collisions in all +cases that it \fImight\fR find such a set. +It merely does so \fImost\fR of the time +with (relatively) few measurements. 
+.LP
+One possible means of dealing with this
+problem is to try and remove sets of pages
+in the hope that enough pages from a set
+of colliding pages will be removed at
+once, so that the remaining pages from
+that collision set won't collide anymore.
+Suppose you have a 4-way set associative
+cache, and that you have six pages that
+collide.  If you remove two of the pages,
+then the remaining four pages don't collide
+anymore either.  This means that by
+removing two pages we have removed six
+collisions, which should be easier to
+detect.
+.LP
+XXX Look into randomizing the pages
+after each iteration of the top-level
+loop to make this sort of serendipitous
+event more likely.
+.NH 2
+Measurement
+.LP
+In order to reduce the number of memory sizes
+that are measured by the system, we use a
+binary search on memory sizes to find "edges"
+in the memory latency.
+We make the simplifying assumption that cache
+sizes are either a power of two, or 1.5 times
+a power of two.  In our experience, this assumption
+has been true.
+We also assume that no cache is smaller than
+512 bytes.
+.LP
+We explore the memory space at intervals
+equivalent to the most recent power of two
+divided by four.  So, starting at one
+megabyte we would (potentially) measure
+memory latency at 1MB, 1.25MB, 1.5MB, and
+1.75MB.  This allows us to detect
+cache sizes at the desired intervals, since
+the measurement at the exact cache size
+can often be corrupted by other system
+activity so the next smaller measurement
+should still be valid.
+.LP
+XXX If the measurement size increment is
+several times larger than a page, then
+perhaps we should actually measure the
+system with a couple pages less than the
+stated size?
+This would allow us some "slop" for
+collisions and might make it easier near
+cache boundaries to get accurate
+measurements.
+The "slop" should probably be some fraction
+of the measurement increment size, such as
+10%, so it scales properly.
+.LP +Since we start with a maximum size as a given, +and we use 512 bytes as a minimum, and we can +compute the full set of possible measurements, +and initialize an array with the desired sizes. +We can then use a modified binary search on +this array to efficiently locate cache edges +while still (potentially) leaving large, flat +plateaus unexplored between the end points. +.LP +Finally, we assume that true memory latency +is monotonically increasing with the amount +of memory that you access. +This means that if the measured latency ever +decreases as you increase the amount of +accessed memory, then the previous measurement +must have been an error and the value is +replaced by the smaller measurement. +.NH 2 +Data analysis +.LP +Assuming the data collected by the system +were noise-free and that the experimental +system had managed to eliminate all artifacts +such as page coloring effects, then the +next problem is to analyze the data to find +the number and size of the caches. +Basically this means examining the data to +find plateaus and cliffs. +Each plateau would represent a cache, and the +cliff represents the edge (size) of the cache. +.LP +Of course, real data is never perfect, and +there are any number of issues which can +affect the experimental results, so the +analysis methodology must be robust to noise. +.LP +XXX describe analysis methodology here +.NH 1 +Cache associativity +.LP +No modern caches are fully associative, meaning that +no caches use LRU replacement, because the performance +overhead for LRU is so severe. Most caches are +either set associative or direct mapped, meaning +that data from a given location can only go to +one of a small number of cache lines, and in the +case of a direct-mapped cache to a single cache line. +.LP +To determine the cache associativity we need to find +a set of pages which have no page collisions and +which (just) fit into the cache. 
We then need to +locate a page which collides with these pages and +append it to the set. +Then we can iterate through the pages in the initial +page set, removing a page at a time, and comparing +the resulting average memory latency with that of +the full set. +When the average memory latency drops significantly, +then we know that this page conflicts with the +full page set, and since the page set only has one +conflict, we know it conflicts with the newly +introduced page. +The number of pages that conflict with this newly +introduced page is the set associativity. +.LP +There is a potential bug in this algorithm +for systems with victim caches! +If the victim cache can hold at least a page +of data, then this algorithm cannot properly +determine the cache associativity because the +victim cache will play the role of additional +associative cache lines. +.LP +For smaller caches there is the additional +problem that the cache associativity may not +be smaller than the number of pages that the +cache may hold. +In which case, this simple approach will +never find pages that collide in the cache. +The solution to this problem is to increase +the line size and the number of pages so that +only portions of each page are accessed, and +there can be enough pages to create collisions. +.NH 1 +Memory parallelism +.LP +With the increasing memory bottleneck, most modern +systems allow multiple outstanding memory references. +On many systems, the effective parallelism depends +on which part of the memory hierarchy is being +accessed. For example, L1 caches can often service +as many as six or eight outstanding requests, while main +memory systems can usually support at most two +outstanding requests. +.LP +To measure the available parallelism for a given +chunk of memory, the system sets up a pointer +chain running through the memory exactly the same +as if it were to measure the average memory +latency. 
It then uses fifteen different access
+routines, one for each possible level of parallelism.\**
+.FS
+The assumption here is that no memory subsystem
+supports more than sixteen accesses in parallel.
+.FE
+Each routine dereferences $N$ pointers in parallel.
+For example, the inner loop of the routine where
+$N=2$ would look something like this:
+.DS L
+\f(CWwhile (iterations-- > 0) {
+	p0 = (char**)*p0;
+	p1 = (char**)*p1;
+}\fP
+.DE
+.LP
+The available parallelism is the maximum speedup
+over all N compared to the sequential case.
+.LP
+Note that this value is often not integral because
+many factors that go into the effective parallelism,
+such as TLB contention, can limit the effective
+parallelism.
+.NH 1
+DRAM pages
+.LP
+Within DRAM chips there is usually one or more
+lines of data which is "cached" in registers
+near the chip outputs.
+Accessing data contained in these lines is typically
+faster than accessing data from the body of the DRAM
+chip.
+The set of memory contained in a bank of DRAM chips
+for a single line (per DRAM chip) of memory is usually
+called a DRAM page.
+.LP
+Recently some systems have started taking advantage
+of this potential performance increase by keeping
+DRAM pages "open" (in the register bank) after an
+access in the hope that the next access will be
+to the same page.
+This means that main memory latency suddenly
+depends on the access history, and that dramatic
+differences in "open" versus "closed" DRAM page
+performance may impact software and data structure
+design.
+.LP
+To measure DRAM page latency, we need to compare
+performance for accesses to "open" versus "closed"
+DRAM pages.
+The standard pointer chain developed for measuring
+cache and memory latency maximizes "open" DRAM page
+accesses while minimizing other overheads, such as
+TLB misses.
+This means that we need to develop another pointer
+chain which maximizes "closed" DRAM accesses while
+still minimizing other overheads such as TLB misses.
+.LP +This can be done by clustering pages into \fIgroups\fP +whose size is smaller than the TLB size. +Within each group the pointer chain switches pages +on each access to maximize the probability of a "closed" +DRAM page access. +For all but the last page in the group, each access +points to the same location within the page, except +on the next page. +The last page points to the next location in the +first page, using the same location bit-switching +selection logic used in the standard pointer chain. +.NH 1 +Conclusion +.LP +XXX Update conclusions +\*[lmbench] is a useful, portable micro-benchmark suite designed to +measure important aspects of system performance. We have found that a good +memory subsystem is at least as important as the processor speed. +As processors get faster and faster, more and more of the system design +effort will need to move to the cache and memory subsystems. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers +were especially helpful. +We thank all of them and especially thank: +Wayne Scott \s-1(BitMover)\s0, +Larry McVoy \s-1(BitMover)\s0, +Bruce Chapman \s-1(SUN)\s0, +and +John McCalpin \s-1(Univ. of Virginia)\s0. +.LP +We would also like to thank all of the people that have +run the benchmark and contributed their results; none of +this would have been possible without their assistance. +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks are available at: +.QP +\fIhttp://ftp.bitmover.com/lmbench\fP +.ft +.\" .R1 +.\" bibliography references-memhier +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. 
\" Divert the label so as to freeze any spaces.
+.	di par*label
+.	in 0
+.	nf
+\&\\$1
+.	di
+.	in
+.	fi
+.	chop par*label
+.	ti -\\n[\\n[.ev]:ai]u
+.	ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c
+.	el \{\
+\\*[par*label]
+.\".	br
+.	\}
+.	rm par*label
+.\}
+..
+.\"********************************************************************
+.\" redefine the way the reference tag is printed so it is enclosed in
+.\" square brackets
+.\"
+.de ref*end-print
+.ie d [F .IP "[\\*([F]" 2
+.el .XP
+\\*[ref*string]
+..
+.\"********************************************************************
+.\" Get journal number entries right.  Now will print as V(N) rather
+.\" than the awful V, N.
+.\"
+.de ref*add-N
+.ref*field N "" ( )
+..
+.\"********************************************************************
+.\" Get journal volume entries right.  Now will print as V(N) rather
+.\" than the awful V, N.
+.\"
+.de ref*add-V
+.ref*field V , "" "" ""
+..
+.\"********************************************************************
+.\" Get the date entry right.  Should not be enclosed in parentheses.
+.\"
+.de ref*add-D
+.ref*field D ","
+..
+.R1
+accumulate
+sort A+DT
+database references-memhier
+label-in-text
+label A.nD.y-2
+bracket-label [ ] ", "
+bibliography references-memhier
+.R2
+.\" .so bios diff --git a/performance/lmbench3/doc/mhz.8 b/performance/lmbench3/doc/mhz.8 new file mode 100644 index 0000000..b9cd1b7 --- /dev/null +++ b/performance/lmbench3/doc/mhz.8 @@ -0,0 +1,29 @@ +.\" $Id: mhz.8 1.3 00/10/16 17:13:52+02:00 staelin@xxxxxxxxxxxxxxxxxxxxx $
+.TH MHZ 8 "$Date: 00/10/16 17:13:52+02:00 $" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH"
+.SH NAME
+mhz \- calculate processor clock rate
+.SH SYNOPSIS
+.B mhz
+.I [-c]
+.SH DESCRIPTION
+.B mhz
+calculates the processor clock rate and megahertz.  It uses an
+unrolled, interlocked loop of adds or shifts.  So far, superscalarness
+has been defeated on the tested processors (SuperSPARC, RIOS, Alpha).
+.SH OUTPUT +Output format is either just the clock rate as a float (-c) or more verbose +.sp +.ft CB +39.80 Mhz, 25 nanosec clock +.ft +.LP +.B mhz +is described more completely in ``mhz: Anatomy of a microbenchmark'' +in +.I "Proceedings of 1998 USENIX Annual Technical Conference", June 1998. +.SH "SEE ALSO" +lmbench(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/par_mem.8 b/performance/lmbench3/doc/par_mem.8 new file mode 100644 index 0000000..0844f55 --- /dev/null +++ b/performance/lmbench3/doc/par_mem.8 @@ -0,0 +1,68 @@ +.\" $Id$ +.TH PAR_MEM 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +par_mem \- memory parallelism benchmark +.SH SYNOPSIS +.B par_mem +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B par_mem +measures the available parallelism in the memory hierarchy, up to +.I len +bytes. Modern processors can often service multiple memory requests +in parallel, while older processors typically blocked on LOAD +instructions and had no available parallelism (other than that +provided by cache prefetching). +.B par_mem +measures the available parallelism at a variety of points, since the +available parallelism is often a function of the data location in the +memory hierarchy. +.LP +In order to measure the available parallelism +.B par_mem +conducts a variety of experiments at each memory size; one for each +level of parallelism. It builds a pointer chain of the desired +length. It then creates an array of pointers which point to chain +entries which are evenly spaced across the chain. Then it starts +running the pointers forward through the chain in parallel. 
It can
+then measure the average memory latency for each level of parallelism,
+and the available parallelism is the minimum average memory latency
+for parallelism 1 divided by the average memory latency across all
+levels of available parallelism.
+.LP
+For example, the inner loop which measures parallelism 2 would look
+something like:
+.sp
+.ft CB
+        p0 = (char **)*p0;
+        p1 = (char **)*p1;
+.ft
+.sp
+in a
+.I for
+loop (the overhead of the
+.I for
+loop is not significant; the loop is an unrolled loop 100 loads long).
+.SH OUTPUT
+Output format is intended as input to \fBxgraph\fP or some similar program
+(we use a perl script that produces pic input).
+There is a set of data produced for each stride.  The data set title
+is the stride size and the data points are the array size in megabytes
+(floating point value) and the load latency over all points in that array.
+.SH "SEE ALSO"
+lmbench(8), line(8), cache(8), tlb(8), par_ops(8).
+.SH "AUTHOR"
+Carl Staelin and Larry McVoy
+.PP
+Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/par_ops.8 b/performance/lmbench3/doc/par_ops.8 new file mode 100644 index 0000000..8327162 --- /dev/null +++ b/performance/lmbench3/doc/par_ops.8 @@ -0,0 +1,39 @@ +.\" $Id$
+.TH PAR_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH"
+.SH NAME
+par_ops \- basic CPU operation parallelism
+.SH SYNOPSIS
+.B par_ops
+[
+.I "-W <warmups>"
+]
+[
+.I "-N <repetitions>"
+]
+.SH DESCRIPTION
+.B par_ops
+measures the available parallelism for basic CPU operations, such as
+integer ADD.  Results are reported as the average operation latency
+divided by the minimum average operation latency across all levels of
+parallelism.
+.TP
+integer bit, add, mul, div, mod operations
+maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations.
+.TP
+uint64 bit, add, mul, div, mod operations
+maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations.
+.TP
+float add, mul, div operations
+maximum parallelism for float ADD, MUL, DIV operations.
+.TP
+double add, mul, div operations
+maximum parallelism for double ADD, MUL, DIV operations.
+.SH BUGS
+This benchmark is highly experimental and may sometimes (frequently?)
+give erroneous results.
+.SH "SEE ALSO"
+lmbench(8), lat_ops(8), par_mem(8).
+.SH "AUTHOR"
+Carl Staelin and Larry McVoy
+.PP
+Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/parallel.ms b/performance/lmbench3/doc/parallel.ms new file mode 100755 index 0000000..b906446 --- /dev/null +++ b/performance/lmbench3/doc/parallel.ms @@ -0,0 +1,385 @@ +.\" This document is GNU groff -mgs -t -p -R -s
+.\" It will not print with normal troffs, it uses groff features, in particular,
+.\" long names for registers & strings.
+.\" Deal with it and use groff - it makes things portable.
+.\"
+.\" $X$ xroff -mgs -t -p -R -s $file
+.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more
+.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr
+.VARPS
+.\" Define a page top that looks cool
+.\" HELLO CARL!  To turn this off, s/PT/oldPT/
+.de draftPT
+.\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP'
+..
+.de lmPT
+.if \\n%>1 \{\
+.	sp -.1i
+.	ps 14
+.	ft 3
+.	nr big 24
+.	nr space \\w'XXX'
+.	nr titlewid \\w'\\*[title]'
+.	nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2
+.	ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25'
+.	ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0
+.	ce 1
+\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar]
+.	ps
+.	sp -.70
+.	ps 12
+\\l'\\n[LL]u'
+.	ft
+.	ps
+.\}
+..
+.\" Define a page bottom that looks cool
+.\" HELLO CARL!  To turn this off, s/BT/oldBT/
+.de draftBT
+.\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP'
+..
+.de lmBT
+.	ps 9
+\v'-1'\\l'\\n(LLu'
+.	sp -1
+.	tl '\(co 2001 \\*[author]'\\*(DY'%'
+.	ps
+..
+.de SP
+.	if t .sp .5
+.	if n .sp 1
+..
+.de BU
+.	SP
+.	ne 2
+\(bu\ 
+.	
if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Utilizing instruction-level parallelism +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds lmdd \f(CWlmdd\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.ds select \f(CWselect\fP +.ds lat_ops \f(CWlat_ops\fP +.ds benchmp \f(CWbenchmp\fP +.ds lat_connect \f(CWlat_connect\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] 
", "
+.\" .R2
+.EQ
+delim $$
+.EN
+.TL
+\s(14Utilizing instruction-level parallelism\s0
+.AU
+\s+2\fR\*[author]\fP\s0
+.AI
+\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP
+.SP
+.AB
+Modern processors and systems provide a great deal of
+parallelism, even for traditional single-threaded
+software.
+Often this parallelism is hidden, but the potential
+performance benefits of restructuring software to allow
+the hardware to utilize this parallelism can be striking.
+For example, modern memory systems can usually support
+at least two outstanding requests to main memory, and
+as many as six or eight outstanding requests to cache
+memory.  Since memory latencies can account for a
+significant fraction of many programs' runtime,
+restructuring data structures and algorithms so
+strictly sequential memory accesses can be
+parallelized can greatly improve performance.
+.AE
+.if t .MC 3.05i
+.NH 1
+Introduction
+.LP
+Computer scientists are generally taught some basic computer
+architecture and a set of standard data structures and
+algorithms, such as lists, hash tables, and binary search.
+These data structures and algorithms are commonly used and
+in many programs their handling can consume a significant
+fraction of the overall runtime.
+However, these data structures and algorithms were
+designed over thirty years ago, when most processors had
+no parallelism.
+.LP
+There has been a great deal of work by compiler writers
+and computer architects on automatically discovering and
+utilizing instruction-level parallelism in existing
+software, but relatively little work has been done on
+examining data structures and algorithms that can enable
+increased instruction-level parallelism.
+.LP
+There has been a great deal of work focussing on
+developing parallel algorithms for multi-processor
+machines, with explicit synchronization primitives
+such as semaphores and barriers.
At this level of +parallelism, the overheads are generally so high +that the parallelism must be fairly coarse-grained, +or else the overhead costs consume any benefits +provided by the parallelism. +.LP +However, instruction-level parallelism is "free"; it +is managed by the hardware and incurs no additional +runtime costs. +The main question is how to structure software algorithms +and data structures to maximize the available parallelism. +.NH 1 +Prior work +.LP +Over the last few years, there has been some work on +improving the performance of critical software in a +architecture-sensitive manner. +.LP +.RN Agarwal96 +describes the design and implementation of a +fast sorting algorithm for superscalar RISC machines. +.LP +The Automatically Tuned Linear Algebra System (ATLAS) +.RN Whaley98 +contains a number of parametrized code generators +for matrix multiply operations, as well as a pluggable +architecture to allow developers to add hardware-specific +modules. +ATLAS then explores the parameter space to find the +optimal parameter settings for the particular system. +.LP +FFTW +.RN Frigo98 +is another project which uses architecture-aware +optimizations. +.NH 1 +Computer architecture primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches. +.NH 2 +Traditional architecture +.LP +One view of a traditional architecture might be the +MIX system defined by Knuth in his classic work on +algorithms and data structures +.RN Knuth73 . +While the MIX instruction set and architecture does +not forbid parallelism, there is no explicit parallelism +mentioned in the description. +Consequently, none of the algorithms assumes any +instruction-level parallelism, or is structured to +explicitly utilize such parallelism had it existed. 
+.LP +The MIX system has a single arithmetic logic unit, +and no floating point unit, so there is no explicit +instruction-level parallelism specified in the +architecture. +.NH 2 +Modern Extensions +.LP +There are a number of modern extensions to computer +architecture that attempt to increase the processor's +ability to do several things at once. Nearly all of +these enhancements, with the notable exception of +the EPIC work, are intended to be invisible to the +average programmer. Most notably, they do not require +changing the instruction set. +.IP "Superscalar processors" +Superscalar processors have multiple processing +units which can operate simultaneously. +.IP "Dynamic instruction reordering" +Dynamic instruction reordering allows the processor +to execute instructions whose operands are ready +before instructions which are stalled waiting for +memory or other instruction's completion. +.IP "Memory parallelism" +By allowing multiple outstanding memory requests, +processors allow the memory subsystem to service +multiple (independent) requests in parallel. +Since memory accesses are a common performance +bottleneck, this can greatly improve performance. +.IP "Vector processing" +Vector processing allows the processor to execute +arithmetic operations on vector operands in +parallel, and in modern commodity processors goes +by names such as MMX, SSE, and 3DNow. +.IP "Simultaneous multi-threading (SMT)" +SMT allows superscalar processors to simultaneously +execute instructions from several threads (contexts) +.RN Tullset96 . +SMT may include extensions which allow for very +lightweight inter-thread synchronization primitives +that enable much finer-grained thread-level +parallelism than traditional synchronization +methods +.RN Tullsen99 . 
+.IP "Explicitly parallel instruction computers (EPIC)" +EPIC allows the compiler to explicitly issue $N$ +instructions in parallel at each instruction, which +informs the hardware that these instructions are +independent and may be executed in parallel +.RN Schlansker00 . +It moves much of the burden regarding dependency +checking from the hardware to the compiler. +.NH 1 +Conclusion +.LP +With the increasing proliferation of both explicit and +hidden parallelism in processor and memory system +designs, it is becoming important to revisit many data +structures and algorithms to adapt them to the new +hardware environment. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. We thank all of them +and especially thank Larry McVoy \s-1(BitMover)\s0 for the +lively conversations and discussions regarding benchmarking +and experimental design. +.\" .R1 +.\" bibliography references-parallel +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. 
Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.R1 +accumulate +sort A+DT +database references-parallel +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references-parallel +.R2 +.\" .so bios diff --git a/performance/lmbench3/doc/pgraph.1 b/performance/lmbench3/doc/pgraph.1 new file mode 100644 index 0000000..562a58a --- /dev/null +++ b/performance/lmbench3/doc/pgraph.1 @@ -0,0 +1,155 @@ +.\" $Id: pgraph.1 1.3 95/11/29 11:54:39-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH PGRAPH 1 "Nov, 1995" "lm@xxxxxxx" "Docomentation tools" +.SH NAME +pgraph \- compile graphs into pic input +.SH SYNOPSIS +.B pgraph +[ options ] +[ +.I filename +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B pgraph +is a perl script which +takes sets of X Y data and generates a (human readable) pic program +that will produce the graphed data. The output is designed such that +you can save it in a file and tweak it to make it fit your document. +Try one and look at the output. The output is actually commented. +.LP +The graph is autosized and auto ticked. 
+.LP +The input data format is similar to +that of xgraph(1), i.e., +.DS +"sloped across +1 1 +2 2 +3 3 + +"straight across +1 4 +2 4 +3 4 +.DE +.SH "CONTROL OPTIONS" +.LP +You may set the graph title, the X title, and the Y title with the +following control sequences in the data stream: +.DS +%T Graph title in +4 point font +%X X axis title and/or units in +2 point font +%Y Y axis title and/or units in +2 point font +%fakemax-X <value> force graph to be that big +%fakemax-Y <value> force graph to be that big +%fakemin-X <value> force graph to be that small +%fakemin-Y <value> force graph to be that small +.DE +.SH OPTIONS +.IP -rev 12 +reverse X/Y data sense (and titles). Note this is done after processing +any fudging of the input data stream(s) (see -xk, -yk, -logx, etc below). +.IP -below +put data set titles below the graph rather than to the right. +.IP -close +no extra space around the data's endpoints. +.IP -qline +connect the quartile center points. +.IP -grid +dotted line grid marks. +.IP -nobox +no box around whole graph. +.IP -big +make the graph take the whole page, and be about 8 inches tall by 7 inches +wide and the title is +8 points. +.IP -slide +make the graph be 4.25 inches square to fit in slides, +in a helvetica bold 10 point font. +.IP -small +make the graph be small, 1.75 inches square, and use an 8 point bold font. +.IP -grapheach +draw each data set in its own graph. +.IP -nolabels +no X/Y/Title labels. +.IP -notitle +no Title label. +.IP -nodatal +no data set labels. +.IP -nomarks +do not mark each data point with distinct markers (endpoints are still +marked). +.IP -k +print values larger than 1000 as value/1000. +.IP -xk +multiply X input by 1024 (blech). +.IP -yk +multiply Y input by 1024 (blech). +.IP -xm +multiply X input by 1024*1024 (blech). +.IP -ym +multiply Y input by 1024*1024 (blech). +.IP -logx +convert X input into log base 2 of X input. +.IP -logy +convert Y input into log base 2 of Y input. 
+.SH EXAMPLE +Workstation price performance from a Digital ad. Process with +.DS +.ps -2 +graph -rev workstations | groff -TX75 + +%T Workstation Price / Performance, 6/93 +%X SPECINT 92 Performance +%Y Price in $1000's +"Dec AXP line +35 5 +65 10 +78 15 +110 70 + +"Sun SPARC line +25 4 +25 8 +38 16 +48 21 +52 23 +64 27 +.DE +.ps +.SH "QUARTILE FORMAT" +Data points are \f(CBx y1 y2 y3 y4 y5\fP. You get two lines from the +first two y values, a mark at the third, and another line from the last two. +.SH "SEE ALSO" +.BR gtroff (1), +.BR gpic (1), +.BR perl (1). +.SH BUGS +-grapheach assumes the set of N graphs will fit on one page. +.LP +Since it is just a simple perl script, I tend to be constantly adding +one more feature on the fly. Consult the script for the latest set of +options. Development is typically done by using the closest set of options +to generate the graph, massage the graph to do what you want, then add that +set of changes as a new option. +.LP +This isn't done as much as I would like. +It isn't integrated with the groff preprocessor yet. +It doesn't know about .GS/.GE things. I use it to manually generate +a pic file and then include that. +.LP +I need to include some example data sets with pgraph. diff --git a/performance/lmbench3/doc/rccs.1 b/performance/lmbench3/doc/rccs.1 new file mode 100644 index 0000000..7bbdf52 --- /dev/null +++ b/performance/lmbench3/doc/rccs.1 @@ -0,0 +1,149 @@ +.\" $Id: rccs.1 1.1 95/11/29 12:52:04-08:00 lm@xxxxxxxxxxxxxxx $ +.de DS +. sp .5 +. nf +. in +4 +. ft CW +. vs -1 +.. +.de DE +. sp .5 +. fi +. in +. ft +. vs +.. +.TH RCCS 1 "Nov, 1995" "lm@xxxxxxx" "Programmers tools" +.SH NAME +rccs \- apply RCS commands to sets of files +.SH SYNOPSIS +.B rccs +command +[ options ] +[ +.I filename +and/or +.I directory +\&.\|.\|. +] +.SH DESCRIPTION +.LP +.B rccs +is a perl script that tries to emulate the Berkeley \fBSCCS\fP program +for \fBRCS\fP. 
If your fingers know how to type commands to \fBSCCS\fP, +just do the same thing to \fBrccs\fP. +.LP +A subset of the \fBSCCS\fP commands are implemented, the ones that I use. +Some new commands have been added. It is easy to add more commands, see +the \fIExample\fP routine at the bottom of \fBrccs\fP to see how. +.LP +This interface does not require a list of files/directories for most +commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +such as clean -f, unedit, unget, do \fBnot\fP have an implied list. In +other words, \f(CBrccs diffs\fP is the same as \f(CBrccs diffs RCS\fP +but \f(CBrccs unedit\fP is not the same as \f(CBrccs unedit RCS\fP. +.SH COMMANDS +.IP options 8 +Note that RCS options are typically passed through to RCS. The options +that made sense to SCCS commands are translated to RCS options. +.IP "ci" 10 +Alias for delta. Checks in files. +.IP "clean [-e] [-f] [-d|y'message'] [files]" +Without any arguments, this command removes all files that are read only +and have an associated RCS file. +With the -e argument, clean removes files that have been checked out +writable but have not been modified. +The -d|y|m option may be combined with -e to check in the set of files that +have been modified. +With the -f option, clean removes all working files, \fBincluding\fP files +that have been modified since the check out. Be careful. +.IP co +Alias for get. Checks out files. +.IP "create [-y|d'message'] [-g] files" +Initial check in of files to the RCS system. The files are then checked out +readonly unless the -g option is present. +The -y or -d options may be used to set the descriptive text message. +Differs from SCCS in that the +original files are not preserved. +.IP deledit +Alias for delta followed by a get -e. +.IP delget +Alias for delta followed by a get. +.IP "delta [-y|d'message'] [-q] [files]" +Check in a delta of the file. -q is changed to RCS' -s and means to be +quiet about what is happening. 
-y'message' or -d'message' or -m'message' +all get sent through to RCS as the check in message. No other arguments +are translated. +.IP "diffs [-C|c] [-r<rev>] [-sdiff] [files]" +Shows changes between the working files and the RCS file. Note that the +files do not need to be checked out, only writable. -C or -c means do a +context diff. -sdiff means do a side by side diff. The sdiff option will +figure out your screen width if it knows how - see the source to make this +work on your system. +.IP edit +Alias for get -e. +.IP enter +Alias for create -g. +.IP fix +Useful if you just checked in the file and then realized you forgot +something. The fix command will remove the top delta from the history +and leave you with an editable working file with the top delta as the +contents. +.IP "get [-e] [-p] [-k] [-s] [files]" +Get, or check out, the file. Without any options, get just gets the +latest revision of the RCS file in the working file. +With -e, check out the file writable. With -p, send the file to stdout. +With -k, suppress expansion of key words. With -s, be quiet about what +is happening. +.IP help +Get a brief help screen of information. +.IP "history [files]" +Print the RCS history (my format) of the specified files. +.IP "info [files]" +Print the list of files being edited. +.IP print +Alias for a loop that prints the history of each file followed by the +contents of the file. +.IP prs +Alias for history. +.IP prt +Alias for history. +.IP unedit +Alias for clean -f. +.IP unget +Alias for clean -f. +.SH GLOBAL OPTIONS +.IP -debug 10 +Turn on debugging. Used when debugging \fBrccs\fP itself. +.IP -verbose +Be more verbose about what is happening. 
+.SH EXAMPLES +To start off, add a bunch of files to RCS: +.DS +rccs create -y'my program name' myprog.c myprog.h +.DE +Now let's edit them all: +.DS +rccs get -e +.DE +If we didn't change anything, the following gives us a clean directory: +.DS +rccs clean -e +.DE +If we changed myprog.h, the following gives us a clean directory after +checking in myprog.h: +.DS +rccs clean -e -d'some message' +.DE +If we want to see what we changed: +.DS +rccs diffs +.DE +.SH "SEE ALSO" +.BR "RCS commands" , +.BR "SCCS commands" , +.BR sdiff (1), +.BR perl (1). +.SH TODO +It would be nice to implement a \fB-i\fP option that prompted before each +action, especially the destructive ones. diff --git a/performance/lmbench3/doc/refdbms.keys b/performance/lmbench3/doc/refdbms.keys new file mode 100644 index 0000000..ff4ab8d --- /dev/null +++ b/performance/lmbench3/doc/refdbms.keys @@ -0,0 +1,20 @@ +Chen93d +Chen94a +Fenwick95 +Howard88 +Jain91 +McCalpin95 +Ousterhout90 +Park90 +Smith82b +Smith85 +Wolman89 +Wong88 +Agarwal95 +Bailey93 +Bitton83 +Chen91b +Dietrich92 +Leutenegger93 +Nelson89 +TPPC92 diff --git a/performance/lmbench3/doc/references b/performance/lmbench3/doc/references new file mode 100644 index 0000000..03167aa --- /dev/null +++ b/performance/lmbench3/doc/references @@ -0,0 +1,186 @@ +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. 
+%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. 
+%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. 
+%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. 
McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] diff --git a/performance/lmbench3/doc/references- b/performance/lmbench3/doc/references- new file mode 100644 index 0000000..6f18ced --- /dev/null +++ b/performance/lmbench3/doc/references- @@ -0,0 +1,175 @@ +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. 
By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. 
+%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. 
Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] diff --git a/performance/lmbench3/doc/references-lmbench3 b/performance/lmbench3/doc/references-lmbench3 new file mode 100644 index 0000000..3f70416 --- /dev/null +++ b/performance/lmbench3/doc/references-lmbench3 @@ -0,0 +1,430 @@ +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%B Proceedings USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%B Proceedings USENIX Winter Conference +%C San Diego, CA +%D January 1996 +%P 279-284 + +%K Bray90 +%A Tim Bray +%T Bonnie benchmark +%D 1990 +%o 
http://www.textuality.com/bonnie/ + +%z Article +%K Brown97 +%A Aaron Brown +%A Margo Seltzer +%T Operating system benchmarking in the wake of lmbench: a case study of the performance of NetBSD on the Intel x86 architecture +%B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems +%C Seattle, WA +%D June 1997 +%P 214-224 +%o http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html + +%z Article +%A Cristina Hristea +%A Danial Lenoski +%A John Keen +%T Measuring memory hierarchy performance of cache-coherent multiprocessors using microbenchmarks +%B Proceedings of Supercomputing '97 +%D November 1997 +%C San Jose, CA +%o http://www.supercomp.org/sc97/proceedings/TECH/HRISTEA/ + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%I Department of Computer Science, University of Utah +%D May 2001 + +%z Thesis +%K Saavedra92 +%A Rafael H. Saavedra-Barrera +%T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking +%I Department of Computer Science, University of California at Berkeley +%D 1992 + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. 
Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Book +%K Jain91 +%A Raj Jain +%T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling +%I Wiley-Interscience +%C New York, NY +%D April 1991 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. 
We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%B Proceedings USENIX Summer Conference +%C Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%B Proceedings USENIX Winter Conference +%C Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. 
Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. 
The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%Q Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z Article +%K McCalpin02 +%A John D. McCalpin +%T The STREAM2 home page +%o http://www.cs.virginia.edu/stream/stream2/ +%D 2002 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Article +%K Shein89 +%A Barry Shein +%A Mike Callahan +%A Paul Woodbury +%T NFSSTONE: A network file server performance benchmark +%B Proceedings USENIX Summer Conference +%C Baltimore, MD +%D June 1989 +%P 269-275 + +%z Article +%K Weicker84 +%A R.P. Weicker +%T Dhrystone: A synthetic systems programming benchmark +%J Communications of the ACM +%V 27 +%N 10 +%P 1013--1030 +%D 1984 + +%z Article +%K Howard88 +%A J. Howard +%A M. Kazar +%A S. Menees +%A S. Nichols +%A M. Satyanrayanan +%A R. Sidebotham +%A M. West +%T Scale and performance in a distributed system +%J ACM Transactions on Computer Systems +%V 6 +%N 1 +%D February 1988 +%P 51--81 +%k Andrew benchmark + +%z Article +%K Banga97 +%A Guarav Banga +%A Peter Druschel +%T Measuring the capacity of a web server +%B Proceedings USENIX Symposium on Internet Technologies and Systems +%C Monterey, CA +%D December 1997 +%P 61--71 + +%z Article +%K Banga98 +%A Guarav Banga +%A Jeffrey C. 
Mogul +%T Scalable kernel performance for internet servers under realistic loads +%B Proceedings of the 1998 USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 69--83 + +%z Article +%K Mogul99 +%A Jeffrey C. Mogul +%T Brittle metrics in operating systems research +%B Proceedings 7th IEEE Workshop on Hot Topics in Operating Systems (HotOS-VII) +%C Rio Rico, AZ +%P 90--95 +%D March 1999 + +%z Article +%K Regehr2002 +%A John Regehr +%T Inferring scheduling behavior with Hourglass +%B Proceedings of the USENIX Annual Technical Conference FREENIX track +%C Monterey, CA +%D June 2002 +%P 143--156 + +%z Article +%K Seltzer99 +%A Margo Seltzer +%A David Krinsky +%A Keith Smith +%A Xiolan Zhang +%T The case for application-specific benchmarking +%B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems +%C Rico, AZ +%D 1999 +%P 102--107 + +%z Article +%K Smith97 +%A Keith A. Smith +%A Margo L. Seltzer +%T File system aging --- Increasing the relevance of file system benchmarks +%B Proceedings of the 1997 SIGMETRICS Conference +%D June 1997 +%C Seattle, WA +%P 203-213 + +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%B Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Report +%K Whaley97 +%A R. 
Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%I Department of Computer Science, University of Tennessee +%C Knoxville, TN +%R UT-CS-97-366 +%D 1997 +%o http://math-atlas.sourceforge.net/ + +%z Article +%K SPEChpc96 +%Q Standard Performance Evaluation Corporation +%T SPEC HPC96 benchmark +%D 1996 +%O http://www.specbench.org/hpg/hpc96/ + +%z Article +%K Parkbench +%Q PARallel Kernels and BENCHmarks committee +%T PARKBENCH +%D 2002 +%O http://www.netlib.org/parkbench/ + +%z Article +%K NAS +%Q NASA Advanced Supercomputing Division, NASA Ames Research Center +%T NAS parallel benchmarks +%O http://www.nas.nasa.gov/NAS/NPB + +%z Article +%K Glendinning94 +%A Ian Glendinning +%T GENESIS distributed memory benchmark suite +%O http://wotug.ukc.ac.uk/parallel/performance/benchmarks/genesis +%D 1994 + +%z Article +%K Intel99 +%Q Intel +%T Profusion --- An 8-way symmetric multiprocessing chipset +%O http://netserver.hp.com/docs/download.asp?file=tp_profusion(r).pdf +%D July 1999 diff --git a/performance/lmbench3/doc/references-memhier b/performance/lmbench3/doc/references-memhier new file mode 100755 index 0000000..59306b6 --- /dev/null +++ b/performance/lmbench3/doc/references-memhier @@ -0,0 +1,251 @@ +%z Article +%K Staelin02b +%A Carl Staelin +%T lmbench3: Measuring scalability +%D November 2002 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Staelin02c +%A Carl Staelin +%T Utilizing intra-processor parallelism +%D December 2002 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Whaley98 +%A R. 
Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%C Proceedings of the 1998 ACM/IEEE SC98 Conference +%D 1998 +%O http://sourceforge.net/projects/math-atlas + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%C Proceedings USENIX Annual Technical Conference +%c New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%C Proceedings USENIX Winter Conference +%c San Diego, CA +%D January 1996 +%P 279-284 + +%a Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%R Masters Thesis +%I School of Computing, University of Utah +%c Salt Lake City, Utah +%D May 2001 +%O http://www.cs.utah.edu/~uros/thesis/thesis.pdf + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. 
Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in understanding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. 
Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%C Proceedings USENIX Summer Conference +%c Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware platforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. (Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%C Proceedings USENIX Winter Conference +%c Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K Chen93d +%A Peter M. Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Park90a +%A Arvin Park +%A J. C. 
Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K McCalpin95 +%A John D. 
McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Article +%K Min01 +%A Rui Min +%A Yiming Hu +%T Improving performance of large physically indexed caches by decoupling memory addresses from cache addresses +%J IEEE Transactions on Computers +%V 50 +%N 11 +%D November 2001 +%P 1191-1201 diff --git a/performance/lmbench3/doc/references-parallel b/performance/lmbench3/doc/references-parallel new file mode 100644 index 0000000..869f794 --- /dev/null +++ b/performance/lmbench3/doc/references-parallel @@ -0,0 +1,171 @@ +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%C Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Article +%K Kumar97 +%A A. Kumar +%T The HP PA-8000 RISC CPU +%J IEEE Micro +%V 17 +%N 2 +%D March-April 1997 +%P 27-32 + +%z Article +%K Schlansker00 +%A M.S. Schlansker +%A B.R. Rau +%T EPIC: Explicitly parallel instruction computing +%J IEEE Computer +%V 33 +%N 2 +%D Feb. 2000 +%P 37-45 + +%z Article +%K Smith95 +%A James E. Smith +%A Gurindar S. 
Sohi +%T The microarchitecture of superscalar processors +%J Proceedings of the IEEE +%V 83 +%D October 1995 +%P 1609-1624 + +%z Thesis +%K Munoz97 +%A Raul E. Silvera Munoz +%T Static instruction scheduling for dynamic issue processors +%I ACAPS Laboratory, School of Computer Science, McGill University +%D 1997 + +%z Article +%K Agarwal96 +%A Ramesh K. Agarwal +%T A super scalar sort algorithm for RISC processors +%C Proceedings 1996 ACM SIGMOD International Conference on Management of Data +%D 1996 +%P 240-246 +%O http://citeseer.nj.nec.com/agarwal96super.html + +%z Article +%K Staelin01a +%A Carl Staelin +%T Analyzing the memory hierarchy +%D October 2001 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Staelin01b +%A Carl Staelin +%T lmbench3: Measuring scalability +%D October 2001 +%I Hewlett-Packard Laboratories +%C Palo Alto, CA + +%z Article +%K Frigo98 +%A M. Frigo +%A S.G. Johnson +%T FFTW: An adaptive software architecture for the FFT +%C Proceedings 1998 ICASSP +%V 3 +%P 1381-1384 +%O http://www.fftw.org/fftw-paper-icassp.pdf + +%z Article +%K Whaley98 +%A R. Clint Whaley +%A Jack Dongarra +%T Automatically tuned linear algebra software +%C Proceedings of the 1998 ACM/IEEE SC98 Conference +%D 1998 +%O http://sourceforge.net/projects/math-atlas + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%C Proceedings USENIX Annual Technical Conference +%c New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%C Proceedings USENIX Winter Conference +%c San Diego, CA +%D January 1996 +%P 279-284 + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%R Masters Thesis +%I School of Computing, University of Utah +%C Salt Lake City, Utah +%D May 2001 +%O http://www.cs.utah.edu/~uros/thesis/thesis.pdf + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. 
Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Book +%K Knuth73 +%A Donald E. Knuth +%T The Art of computer programming, 2nd Edition +%I Addison-Wesley +%D 1973 + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 diff --git a/performance/lmbench3/doc/references-userguide b/performance/lmbench3/doc/references-userguide new file mode 100644 index 0000000..f6fea3d --- /dev/null +++ b/performance/lmbench3/doc/references-userguide @@ -0,0 +1,338 @@ +%z Article +%K Banga97 +%A Guarav Banga +%A Peter Druschel +%T Measuring the capacity of a web server +%B Proceedings USENIX Symposium on Internet Technologies and Systems +%C Monterey, CA +%D December 1997 + +%z Article +%K Banga98 +%A Guarav Banga +%A Jeffrey C. Mogul +%T Scalable kernel performance for internet servers under realistic loads +%B Proceedings of the 1998 USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 + +%K Bray90 +%A Tim Bray +%T Bonnie benchmark +%D 1990 +%O http://www.textuality.com/bonnie/ + +%z Article +%K Brown97 +%A Aaron Brown +%A Margo Seltzer +%T Operating system benchmarking in the wake of lmbench: A case study of the performance of NetBSD on the Intel x86 architecture +%B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems +%C Seattle, WA +%D June 1997 +%P 214-224 +%O http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html + +%z Article +%K Chen93d +%A Peter M. 
Chen +%A David Patterson +%T Storage performance \- metrics and benchmarks +%J Proceedings of the IEEE +%V 81 +%N 8 +%D August 1993 +%P 1151-1165 +%x Discusses metrics and benchmarks used in storage performance evaluation. +%x Describes, reviews, and runs popular I/O benchmarks on three systems. Also +%x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling +%x benchmark with predicted performance. +%k I/O, storage, benchmark, workload, self-scaling benchmark, +%k predicted performance, disk, performance evaluation +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:21:11 PDT 1995) + +%z Article +%K Chen94a +%A P. M. Chen +%A D. A. Patterson +%T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance +%D November 1994 +%J Transactions on Computer Systems +%V 12 +%N 4 +%P 308-339 +%x Current I/O benchmarks suffer from several chronic problems: they +%x quickly become obsolete; they do not stress the I/O system; and they +%x do not help much in undelsi;anding I/O system performance. We +%x propose a new approach to I/O performance analysis. First, we +%x propose a self-scaling benchmark that dynamically adjusts aspects of +%x its workload according to the performance characteristic of the +%x system being measured. By doing so, the benchmark automatically +%x scales across current and future systems. The evaluation aids in +%x understanding system performance by reporting how performance varies +%x according to each of five workload parameters. Second, we propose +%x predicted performance, a technique for using the results from the +%x self-scaling evaluation to estimate quickly the performance for +%x workloads that have not been measured. We show that this technique +%x yields reasonably accurate performance estimates and argue that this +%x method gives a far more accurate comparative performance evaluation +%x than traditional single-point benchmarks. 
We apply our new +%x evaluation technique by measuring a SPARCstation 1+ with one SCSI +%x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running +%x the Sprite LFS operating system with a three-disk disk array, a +%x Convex C240 minisupercomputer with a four-disk disk array, and a +%x Solbourne 5E/905 fileserver with a two-disk disk array. +%s toc@xxxxxxxxxx (Mon Mar 13 10:57:38 1995) +%s wilkes%hplajw@xxxxxxxxxx (Sun Mar 19 12:38:01 PST 1995) +%s wilkes%cello@xxxxxxxxxx (Sun Mar 19 12:38:53 PST 1995) + +%z Article +%K Fenwick95 +%A David M. Fenwick +%A Denis J. Foley +%A William B. Gist +%A Stephen R. VanDoren +%A Danial Wissell +%T The AlphaServer 8000 series: high-end server platform development +%J Digital Technical Journal +%V 7 +%N 1 +%D August 1995 +%P 43-65 +%x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end +%x server products. Both servers are based on the 300Mhz Alpha 21164 +%x microprocessor and on the AlphaServer 8000-series platform architecture. +%x The AlphaServer 8000 platform development team set aggressive system data +%x bandwidth and memory read latency targets in order to achieve high-performance +%x goals. The low-latency criterion was factored into design decisions made at +%x each of the seven layers of platform development. The combination of +%x industry-leading microprocessor technology and a system platform focused +%x on low latency has resulted in a 12-processor server implementation --- +%x the AlphaServer 8400 --- capable of supercomputer levels of performance. +%k DEC Alpha server, performance, memory latency +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 17:27:23 PDT 1995) + +%z Book +%K Hennessy96 +%A John L. Hennessy +%A David A. Patterson +%T Computer Architecture A Quantitative Approach, 2nd Edition +%I Morgan Kaufman +%D 1996 + +%z Article +%K Howard88 +%A J. Howard +%A M. Kazar +%A S. Menees +%A S. Nichols +%A M. Satyanrayanan +%A R. Sidebotham +%A M. 
West +%T Scale and performance in a distributed system +%J ACM Transactions on Computer Systems +%V 6 +%N 1 +%D February 1988 +%P 51-81 +%k Andrew benchmark + +%z Book +%K Jain91 +%A Raj Jain +%T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling +%I Wiley-Interscience +%C New York, NY +%D April 1991 + +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%D December 1995 + +%z InProceedings +%K McVoy91 +%A L. W. McVoy +%A S. R. Kleiman +%T Extent-like Performance from a Unix File System +%B Proceedings USENIX Winter Conference +%C Dallas, TX +%D January 1991 +%P 33-43 + +%z Article +%K McVoy96 +%A Larry McVoy +%A Carl Staelin +%T lmbench: Portable tools for performance analysis +%B Proceedings USENIX Winter Conference +%C San Diego, CA +%D January 1996 +%P 279-284 + +%z InProceedings +%K Ousterhout90 +%s wilkes%cello@xxxxxxxxxxxxx (Fri Jun 29 20:46:08 PDT 1990) +%A John K. Ousterhout +%T Why aren't operating systems getting faster as fast as hardware? +%B Proceedings USENIX Summer Conference +%C Anaheim, CA +%D June 1990 +%P 247-256 +%x This paper evaluates several hardware pplatforms and operating systems using +%x a set of benchmarks that stress kernel entry/exit, file systems, and +%x other things related to operating systems. The overall conclusion is that +%x operating system performance is not improving at the same rate as the base speed of the +%x underlying hardware. The most obvious ways to remedy this situation +%x are to improve memory bandwidth and reduce operating systems' +%x tendency to wait for disk operations to complete. +%o Typical performance of 10-20 MIPS cpus is only 0.4 times what +%o their raw hardware performance would suggest. HP-UX is +%o particularly bad on the HP 9000/835, at about 0.2x. 
(Although +%o this measurement discounted a highly-tuned getpid call.) +%k OS performance, RISC machines, HP9000 Series 835 system calls + +%z Article +%K Park90a +%A Arvin Park +%A J. C. Becker +%T IOStone: a synthetic file system benchmark +%J Computer Architecture News +%V 18 +%N 2 +%D June 1990 +%P 45-52 +%o this benchmark is useless for all modern systems; it fits +%o completely inside the file system buffer cache. Soon it may even +%o fit inside the processor cache! +%k IOStone, I/O, benchmarks +%s staelin%cello@xxxxxxxxxx (Wed Sep 27 16:37:26 PDT 1995) + +%z Thesis +%K Prestor01 +%A Uros Prestor +%T Evaluating the memory performance of a ccNUMA system +%I Department of Computer Science, University of Utah +%D May 2001 + +%z Thesis +%K Saavedra92 +%A Rafael H. Saavedra-Barrera +%T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking +%I Department of Computer Science, University of California at Berkeley +%D 1992 + +%z Article +%K Saavedra95 +%A R.H. Saavedra +%A A.J. 
Smith +%T Measuring cache and TLB performance and their effect on benchmark runtimes +%J IEEE Transactions on Computers +%V 44 +%N 10 +%D October 1995 +%P 1223-1235 + +%z Article +%k Seltzer99 +%A Margo Seltzer +%A David Krinsky +%A Keith Smith +%A Xiolan Zhang +%T The case for application-specific benchmarking +%B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems +%C Rico, AZ +%D 1999 + +%z Article +%K Shein89 +%A Barry Shein +%A Mike Callahan +%A Paul Woodbury +%T NFSSTONE: A network file server performance benchmark +%B Proceedings USENIX Summer Conference +%C Baltimore, MD +%D June 1989 +%P 269-275 + +%z Article +%K Staelin98 +%A Carl Staelin +%A Larry McVoy +%T mhz: Anatomy of a microbenchmark +%B Proceedings USENIX Annual Technical Conference +%C New Orleans, LA +%D June 1998 +%P 155-166 + +%z Article +%K FSF89 +%A Richard Stallman +%Q Free Software Foundation +%T General Public License +%D 1989 +%O Included with \*[lmbench] + +%z Book +%K Toshiba94 +%A Toshiba +%T DRAM Components and Modules +%I Toshiba America Electronic Components, Inc. +%P A59-A77,C37-C42 +%D 1994 + +%z Article +%K Tullsen96 +%A Dean Tullsen +%A Susan Eggers +%A Joel Emer +%A Henry Levy +%A Jack Lo +%A Rebecca Stamm +%T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor +%C Proceedings of the 23rd Annual International Symposium on Computer Architecture +%D May 1996 +%P 191-202 +%O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps + +%z Article +%K Tullsen99 +%A Dean Tullsen +%A Jack Lo +%A Susan Eggers +%A Henry Levy +%T Supporting fine-grain synchronization on a simultaneous multithreaded processor +%B Proceedings of the 5th International Symposium on High Performance Computer Architecture +%D January 1999 +%P 54-58 +%O http://www.cs.washington.edu/research/smt/papers/hpca.ps + +%z Article +%K Weicker84 +%A R.P. 
Weicker +%T Dhrystone: A synthetic systems programming benchmark +%J CACM +%V 27 +%N 10 +%P 1013-1030 +%D 1984 + +%z Article +%K Wolman89 +%A Barry L. Wolman +%A Thomas M. Olson +%T IOBENCH: a system independent IO benchmark +%J Computer Architecture News +%V 17 +%N 5 +%D September 1989 +%P 55-70 +%x IOBENCH is an operating system and processor independent synthetic +%x input/output (IO) benchmark designed to put a configurable IO and +%x processor (CP) load on the system under test. This paper discusses +%x the UNIX versions. +%k IOBENCH, synthetic I/O benchmark, UNIX workload +%s vinton%cello@xxxxxxxxxxxxx (Fri Sep 20 12:55:58 PDT 1991) + diff --git a/performance/lmbench3/doc/references.private b/performance/lmbench3/doc/references.private new file mode 100644 index 0000000..7394354 --- /dev/null +++ b/performance/lmbench3/doc/references.private @@ -0,0 +1,7 @@ +%z Article +%K McCalpin95 +%A John D. McCalpin +%T Memory bandwidth and machine balance in current high performance computers +%J IEEE Technical Committee on Computer Architecture newsletter +%V to appear +%D Dec. 1995 diff --git a/performance/lmbench3/doc/reporting.3 b/performance/lmbench3/doc/reporting.3 new file mode 100644 index 0000000..e63124a --- /dev/null +++ b/performance/lmbench3/doc/reporting.3 @@ -0,0 +1,71 @@ +.\" +.\" @(#)lmbench.man 2.0 98/04/24 +.\" +.\" lmbench - benchmarking toolbox +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench reporting" 3 "$Date:" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH" +.SH "NAME" +milli, micro, nano, mb, kb \- the lmbench reporting subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "void milli(char *s, uint64 n)" +.LP +.B "void micro(char *s, uint64 n)" +.LP +.B "void nano(char *s, uint64 n)" +.LP +.B "void mb(uint64 bytes)" +.LP +.B "void kb(uint64 bytes)" +.SH "DESCRIPTION" +Creating benchmarks using the +.I lmbench +timing harness is easy. 
+Since it is so easy to measure performance using +.IR lmbench , +it is possible to quickly answer questions that arise during system +design, development, or tuning. For example, image processing +.LP +There are two attributes that are critical for performance, latency +and bandwidth, and +.IR lmbench 's +timing harness makes it easy to measure and report results for both. +The measurement interface, +.B benchmp +is the same, but the reporting functions are different. +Latency is usually important for frequently executed operations, and +bandwidth is usually important when moving large chunks of data. +.TP +.B "void milli(char *s, uint64 n)" +print out the time per operation in milli-seconds. +.I n +is the number of operations during the timing interval, which is passed +as a parameter because each +.I loop_body +can contain several operations. +.TP +.B "void micro(char *s, uint64 n)" +print the time per operation in micro-seconds. +.TP +.B "void nano(char *s, uint64 n)" +print the time per operation in nano-seconds. +.TP +.B "void mb(uint64 bytes)" +print the bandwidth in megabytes per second. +.TP +.B "void kb(uint64 bytes)" +print the bandwidth in kilobytes per second. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), timing(3), results(3) +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/results.3 b/performance/lmbench3/doc/results.3 new file mode 100644 index 0000000..b6d099d --- /dev/null +++ b/performance/lmbench3/doc/results.3 @@ -0,0 +1,88 @@ +.\" +.\" @(#)results.man 2.0 98/04/24 +.\" +.\" results - lmbench results subsystem +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench result management" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH" +.SH "NAME" +insertinit, insertsort, get_results, set_results, save_median, save_minimum + \- the lmbench results subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "#define TRIES 11" +.LP +.B "typedef struct { uint64 u, n } value_t;" +.LP +.B "typedef struct { int N; value_t v[TRIES]; } result_t;" +.LP +.B "int sizeof_result(int N)" +.LP +.B "void insertinit(result_t *r)" +.LP +.B "void insertsort(uint64 u, uint64 n, result_t *r)" +.LP +.B "result_t* get_results()" +.LP +.B "void set_results(result_t *r)" +.LP +.B "void save_median()" +.LP +.B "void save_minimum()" +.SH "DESCRIPTION" +These routines provide some simple data management functionality. +In most cases, you will not need these routines. +.LP +The current timing results can be accessed using the routines in +timing(3). The current timing results may be modified using +.B save_median +and +.BR save_minimum . +.TP +.B "int sizeof_result(int N)" +returns the number of bytes to allocate for a result_t which contains +.I N +results. +.TP +.B "void insertinit(result_t *r)" +initializes the results array. +.TP +.B "void insertsort(uint64 u, uint64 n, result_t *r)" +insert +.I u +and +.I n +into +.IR r . +Results are sorted in decreasing order by +.IR u/n . +.TP +.B "void get_results(result_t *r)" +get a copy of the current results. +.TP +.B "void set_results(result_t *r)" +save a copy +.I r +as the current results. +.TP +.B "void save_median()" +sets the timing results to the median of the current results. 
+.TP +.B "void save_minimum()" +sets the timing results to the minimum of the current results. +.LP +Results are sorted in ascending order, so the minimum value is at +.B TRIES-1 +and the maximum value is at +.BR 0 . +.SH "FUTURES" +Development of \fIlmbench\fR is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), reporting(3), results(3) +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/stream.8 b/performance/lmbench3/doc/stream.8 new file mode 100644 index 0000000..762c710 --- /dev/null +++ b/performance/lmbench3/doc/stream.8 @@ -0,0 +1,28 @@ +.\" $Id$ +.TH stream 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +stream \- John McCalpin's STREAM benchmark +.SH SYNOPSIS +.B stream +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B stream +mimics John McCalpin's STREAM benchmark. It measures memory bandwidth. +.SH BUGS +.B stream +is an experimental benchmark, but it seems to work well on most +systems. +.SH "SEE ALSO" +lmbench(8), bw_mem(8), line(8), tlb(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. 
diff --git a/performance/lmbench3/doc/timing.3 b/performance/lmbench3/doc/timing.3 new file mode 100644 index 0000000..2ebea3a --- /dev/null +++ b/performance/lmbench3/doc/timing.3 @@ -0,0 +1,163 @@ +.\" +.\" @(#)timing.man 2.0 98/04/24 +.\" +.\" timing - lmbench timing subsystem +.\" +.\" Copyright (C) 1998 Carl Staelin and Larry McVoy +.\" E-mail: staelin@xxxxxxxxxx +.\" +.TH "lmbench timing" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH" + +.SH "NAME" +benchmp, benchmp_getstate, benchmp_interval, + start, stop, get_n, set_n, gettime, settime, + get_enough, t_overhead, l_overhead \- the lmbench timing subsystem +.SH "SYNOPSIS" +.B "#include ``lmbench.h''" +.LP +.B "typedef u_long iter_t" +.LP +.B "typedef (*bench_f)(iter_t iterations, void* cookie)" +.LP +.B "typedef (*support_f)(iter_t iterations, void* cookie)" +.LP +.B "void benchmp(support_f initialize, bench_f benchmark, support_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie)" +.LP +.B "void* benchmp_getstate()" +.LP +.B "iter_t benchmp_interval(void* state)" +.LP +.B "void start(struct timeval *begin)" +.LP +.B "uint64 stop(struct timeval *begin, struct timeval *end)" +.LP +.B "uint64 get_n()" +.LP +.B "void set_n(uint64 n)" +.LP +.B "uint64 gettime()" +.LP +.B "void settime(uint64 u)" +.LP +.B "uint64 get_enough(uint64 enough)" +.LP +.B "uint64 t_overhead()" +.LP +.B "double l_overhead()" +.SH "DESCRIPTION" +The single most important element of a good benchmarking system is +the quality and reliability of its measurement system. +.IR lmbench 's +timing subsystem manages the experimental timing process to produce +accurate results in the least possible time. +.I lmbench +includes methods for measuring and eliminating several factors that +influence the accuracy of timing measurements, such as the resolution +of the system clock. 
+.LP +.I lmbench +gets accurate results by considering clock resolution, +auto-sizing the duration of each benchmark, and conducting multiple +experiments. +.TP +.B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" +measures the performance of +.I benchmark +repeatedly and reports the median result. +.I benchmp +creates +.I parallel +sub-processes which run +.I benchmark +in parallel. This allows lmbench to measure the system's ability to +scale as the number of client processes increases. Each sub-process +executes +.I initialize +before starting the benchmarking cycle. It will call +.I benchmark +several times in order to collect +.I repetitions +results. After all the benchmark results have been collected, +.I cleanup +is called to cleanup any resources which may have been allocated +by +.I initialize +or +.I benchmark . +.I cookie +is a void pointer to a hunk of memory that can be used to store any +parameters or state that is needed by the benchmark. +.TP +.B "void benchmp_getstate()" +returns a void pointer to the lmbench-internal state used during +benchmarking. The state is not to be used or accessed directly +by clients, but rather would be passed into +.I benchmp_interval. +.TP +.B "iter_t benchmp_interval(void* state)" +returns the number of times the benchmark should execute its +benchmark loop during this timing interval. This is used only +for weird benchmarks which cannot implement the benchmark +body in a function which can return, such as the page fault +handler. Please see +.I lat_sig.c +for sample usage. +.TP +.B "void start(struct timeval *begin)" +starts a timing interval. If +.I begin +is non-null, save the start time in +.I begin . +.TP +.B "uint64 stop(struct timeval *begin, struct timeval *end)" +stops a timing interval, returning the number of elapsed micro-seconds. +.TP +.B "uint64 get_n()" +returns the number of times +.I loop_body +was executed during the timing interval. 
+.TP +.B "void set_n(uint64 n)" +sets the number of times +.I loop_body +was executed during the timing interval. +.TP +.B "uint64 gettime()" +returns the number of micro-seconds in the timing interval. +.TP +.B "void settime(uint64 u)" +sets the number of micro-seconds in the timing interval. +.TP +.B "uint64 get_enough(uint64 enough)" +return the time in micro-seconds needed to accurately measure a timing +interval. +.TP +.B "uint64 t_overhead()" +return the time in micro-seconds needed to measure time. +.TP +.B "double l_overhead()" +return the time in micro-seconds needed to do a simple loop. +.SH "VARIABLES" +There are three environment variables that can be used to modify +the +.I lmbench +timing subsystem: ENOUGH, TIMING_O, and LOOP_O. +The environment variables can be used to directly set the results +of +.B get_enough , +.B t_overhead , +and +.B l_overhead . +When running a large number of benchmarks, or repeating the same +benchmark many times, this can save time by eliminating the necessity +of recalculating these values for each run. +.SH "FUTURES" +Development of +.I lmbench +is continuing. +.SH "SEE ALSO" +lmbench(8), lmbench(3), reporting(3), results(3). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/tlb.8 b/performance/lmbench3/doc/tlb.8 new file mode 100644 index 0000000..b95920b --- /dev/null +++ b/performance/lmbench3/doc/tlb.8 @@ -0,0 +1,55 @@ +.\" $Id$ +.TH TLB 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" +.SH NAME +tlb \- TLB size and latency benchmark +.SH SYNOPSIS +.B tlb +[ +.I "-L <line size>" +] +[ +.I "-M <len>" +] +[ +.I "-W <warmups>" +] +[ +.I "-N <repetitions>" +] +.SH DESCRIPTION +.B tlb +tries to determine the size, in pages, of the TLB. +The largest amount of memory it will examine is +.I len +bytes. +.LP +.B tlb +compares the memory latency for two different pointer chains. 
+The two chains occupy the same amount of cache space, but they stress +the memory subsystem differently. The first chain accesses one word +per page, while the second chain +randomly jumps through all the lines on a page before jumping to the +next page. When all of the pointers reside in the cache (which is the +usual case), and all of the pages for the first chain reside in the +TLB, then the average memory latencies should be identical. Assuming +there is a fixed size TLB, then at some point the number of pages +accessed by the first page will be larger than the TLB. At this point +the average latency for each memory access for the first chain will be +a cache hit plus some fraction of a TLB miss. +.LP +Once the TLB boundary is located +.B tlb +reports the TLB miss latency as the TLB latency for twice as many +pages as the TLB can hold. +.SH BUGS +.B tlb +is an experimental benchmark, but it seems to work well on most +systems. However, if a processor has a TLB hierarchy +.B tlb +only finds the top level TLB. +.SH "SEE ALSO" +lmbench(8), line(8), cache(8), par_mem(8). +.SH "AUTHOR" +Carl Staelin and Larry McVoy +.PP +Comments, suggestions, and bug reports are always welcome. diff --git a/performance/lmbench3/doc/tmac.usenix b/performance/lmbench3/doc/tmac.usenix new file mode 100644 index 0000000..e66ac1f --- /dev/null +++ b/performance/lmbench3/doc/tmac.usenix @@ -0,0 +1,1848 @@ +.ig +Copyright (C) 1990, 1991 Free Software Foundation, Inc. + Written by James Clark (jjc@xxxxxxxxxxx) + +This file is part of groff. + +groff is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 1, or (at your option) any later +version. + +groff is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received a copy of the GNU General Public License along +with groff; see the file LICENSE. If not, write to the Free Software +Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +.. +.if !\n(.g .ab These ms macros require groff. +.if \n(.C \ +. ab The groff ms macros do not work in compatibility mode. +.\" Enable warnings. You can delete this if you want. +.warn +.\" See if already loaded. +.if r GS .nx /dev/null +.nr GS 1 +.de @error +.tm \\n(.F:\\n(.c: macro error: \\$* +.. +.de @warning +.tm \\n(.F:\\n(.c: macro warning: \\$* +.. +.de @fatal +.ab \\n(.F:\\n(.c: fatal macro error: \\$* +.. +.de @not-implemented +.@error sorry, \\$0 not implemented +.als \\$0 @nop +.. +.als TM @not-implemented +.als CT @not-implemented +.de @nop +.. +.de @init +.\" a non-empty environment +.ev ne +\c +.ev +.ev nf +'nf +.ev +.. +.ds REFERENCES References +.ds ABSTRACT ABSTRACT +.ds TOC Table of Contents +.ds MONTH1 January +.ds MONTH2 February +.ds MONTH3 March +.ds MONTH4 April +.ds MONTH5 May +.ds MONTH6 June +.ds MONTH7 July +.ds MONTH8 August +.ds MONTH9 September +.ds MONTH10 October +.ds MONTH11 November +.ds MONTH12 December +.ds MO \\*[MONTH\n[mo]] +.nr *year \n[yr]+1900 +.ds DY \n[dy] \*[MO] \n[*year] +.de ND +.if \\n[.$] .ds DY "\\$* +.. +.de DA +.if \\n[.$] .ds DY "\\$* +.ds CF \\*[DY] +.. +.\" indexing +.de IX +.tm \\$1\t\\$2\t\\$3\t\\$4 ... \\n[PN] +.. +.\" print an error message and then try to recover +.de @error-recover +.@error \\$@ (recovering) +.nr *pop-count 0 +.while !'\\n(.z'' \{\ +. \"@warning automatically terminating diversion \\n(.z +. ie d @div-end!\\n(.z .@div-end!\\n(.z +. el .*div-end-default +. nr *pop-count +1 +. \" ensure that we don't loop forever +. if \\n[*pop-count]>20 .@fatal recovery failed +.\} +.while !'\\n[.ev]'0' .ev +.par@reset-env +.par@reset +.. +.de *div-end-default +.ds *last-div \\n(.z +.br +.di +.ev nf +.\\*[*last-div] +.ev +.. 
+.\" **************************** +.\" ******** module cov ******** +.\" **************************** +.\" Cover sheet and first page. +.de cov*err-not-after-first-page +.@error \\$0 is not allowed after the first page has started +.. +.de cov*err-not-before-tl +.@error \\$0 is not allowed before TL +.. +.de cov*err-not-again +.@error \\$0 is not allowed more than once +.. +.de cov*err-not-after-ab +.@error \\$0 is not allowed after first AB, LP, PP, IP, SH or NH +.. +.als AU cov*err-not-before-tl +.als AI cov*err-not-before-tl +.als AB cov*err-not-before-tl +.de cov*first-page-init +.rm cov*first-page-init +.par@init +.als RP cov*err-not-after-first-page +.@init +.ie \\n[cov*rp-format] \{\ +. pg@cs-top +. als FS cov*FS +. als FE cov*FE +.\} +.el \{\ +. pg@top +. als FS @FS +. als FE @FE +.\} +.wh 0 pg@top +.. +.wh 0 cov*first-page-init +.\" This handles the case where FS occurs before TL or LP. +.de FS +.br +\\*[FS]\\ +.. +.nr cov*rp-format 0 +.nr cov*rp-no 0 +.\" released paper format +.de RP +.nr cov*rp-format 1 +.if \\n[.$] .if '\\$1'no' .nr cov*rp-no 1 +.pn 0 +.. +.de TL +.br +.als TL cov*err-not-again +.rn @AB AB +.rn @AU AU +.rn @AI AI +.di cov*tl-div +.par@reset +.ft 3 +.ie \\n[VARPS] \{\ +. ps 14 +. vs 16 +.\} +.el \{\ +. ps +2 +. vs +3p +.\} +.ll (u;\\n[LL]*5/6) +.nr cov*n-au 0 +.. +.de @AU +.par@reset +.if !'\\n(.z'' \{\ +. br +. di +.\} +.nr cov*n-au +1 +.di cov*au-div!\\n[cov*n-au] +.nf +.ft 2 +.ps \\n[PS] +.. +.de @AI +.par@reset +.if !'\\n(.z'' \{\ +. br +. di +.\} +.ie !\\n[cov*n-au] .@error AI before AU +.el \{\ +. di cov*ai-div!\\n[cov*n-au] +. nf +. ft 1 +. ps \\n[PS] +.\} +.. +.de LP +.if !'\\n[.z]'' \{\ +. br +. di +.\} +.br +.cov*ab-init +.cov*print +\\*[\\$0]\\ +.. 
+.als IP LP +.als PP LP +.als XP LP +.als NH LP +.als SH LP +.als MC LP +.als RT LP +.de cov*ab-init +.als cov*ab-init @nop +.als LP @LP +.als IP @IP +.als PP @PP +.als XP @XP +.als RT @RT +.als SH @SH +.als NH @NH +.als QP @QP +.als RS @RS +.als RE @RE +.als QS @QS +.als QE @QE +.als MC @MC +.als EQ @EQ +.als EN @EN +.als AB cov*err-not-after-ab +.als AU par@AU +.als AI par@AI +.als TL par@TL +.. +.de @AB +.if !'\\n(.z'' \{\ +. br +. di +.\} +.cov*ab-init +.di cov*ab-div +.par@ab-indent +.par@reset +.if !'\\$1'no' \{\ +. ft 2 +. ce 1 +\\*[ABSTRACT] +. sp +. ft 1 +.\} +.ns +.@PP +.. +.de AE +.ie '\\n(.z'cov*ab-div' \{\ +. als AE cov*err-not-again +. br +. di +.\" nr cov*ab-height \\n[dn] +. par@reset-env +. par@reset +. cov*print +.\} +.el .@error AE without AB +.. +.de @div-end!cov*ab-div +.AE +.. +.de cov*print +.als cov*print @nop +.ie d cov*tl-div \{\ +. ie \\n[cov*rp-format] .cov*rp-print +. el .cov*draft-print +.\} +.el \{\ +. if \\n[cov*rp-format] \{\ +. @warning RP format but no TL +. bp 1 +. als FS @FS +. als FE @FE +. \} +. br +.\} +.. +.de cov*rp-print +.nr cov*page-length \\n[.p] +.pl 1000i +.cov*tl-au-print +.sp 3 +.if d cov*ab-div \{\ +. nf +. cov*ab-div +.\} +.sp 3 +.par@reset +\\*[DY] +.br +.if \\n[cov*fn-height] \{\ +. sp |(u;\\n[cov*page-length]-\\n[FM]\ +-\\n[cov*fn-height]-\\n[fn@sep-dist]>?\\n[nl]) +. fn@print-sep +. ev nf +. cov*fn-div +. ev +. ie \\n[cov*rp-no] .rm cov*fn-div +. el \{\ +. rn cov*fn-div fn@overflow-div +. nr fn@have-overflow 1 +. \} +.\} +.als FS @FS +.als FE @FE +.\" If anything was printed below where the footer line is normally printed, +.\" then that's an overflow. +.if -\\n[FM]/2+1v+\\n[cov*page-length]<\\n[nl] .@error cover sheet overflow +.pl \\n[cov*page-length]u +.bp 1 +.if !\\n[cov*rp-no] .cov*tl-au-print +.rs +.sp 1 +.. +.de cov*draft-print +.cov*tl-au-print +.if d cov*ab-div \{\ +. nf +. sp 2 +. cov*ab-div +.\} +.sp 1 +.. 
+.de cov*tl-au-print +.par@reset +.nf +.rs +.sp 3 +.ce 9999 +.cov*tl-div +.nr cov*i 1 +.nr cov*sp 1v +.while \\n[cov*i]<=\\n[cov*n-au] \{\ +. sp \\n[cov*sp]u +. cov*au-div!\\n[cov*i] +. ie d cov*ai-div!\\n[cov*i] \{\ +. sp .5v +. cov*ai-div!\\n[cov*i] +. nr cov*sp 1v +. \} +. el .nr cov*sp .5v +. nr cov*i +1 +.\} +.ce 0 +.. +.nr cov*fn-height 0 +.nr cov*in-fn 0 +.\" start of footnote on cover +.de cov*FS +.if \\n[cov*in-fn] \{\ +. @error nested FS +. FE +.\} +.nr cov*in-fn 1 +.ev fn +.par@reset-env +.da cov*fn-div +.if !\\n[cov*fn-height] .ns +.ie \\n[.$] .FP "\\$1" no +.el .@LP +.. +.de @div-end!cov*fn-div +.cov*FE +.. +.\" end of footnote on cover +.de cov*FE +.ie '\\n(.z'cov*fn-div' \{\ +. br +. ev +. di +. nr cov*in-fn 0 +. nr cov*fn-height +\\n[dn] +.\} +.el .@error FE without matching FS +.. +.\" *************************** +.\" ******** module pg ******** +.\" *************************** +.\" Page-level formatting. +.\" > 0 if we have a footnote on the current page +.nr pg@fn-flag 0 +.nr pg@colw 0 +.nr pg@fn-colw 0 +.nr HM 1i +.nr FM 1i +.nr PO 1.25i +.ds LF +.ds CF +.ds RF +.ds LH +.ds CH -\\n[PN]- +.ds RH +.ds pg*OH '\\*[LH]'\\*[CH]'\\*[RH]' +.ds pg*EH '\\*[LH]'\\*[CH]'\\*[RH]' +.ds pg*OF '\\*[LF]'\\*[CF]'\\*[RF]' +.ds pg*EF '\\*[LF]'\\*[CF]'\\*[RF]' +.de OH +.ds pg*\\$0 "\\$* +.. +.als EH OH +.als OF OH +.als EF OH +.de PT +.ie \\n%=1 .if \\n[pg*P1] .tl \\*[pg*OH] +.el \{\ +. ie o .tl \\*[pg*OH] +. el .tl \\*[pg*EH] +.\} +.. +.de BT +.ie o .tl \\*[pg*OF] +.el .tl \\*[pg*EF] +.. +.nr pg*P1 0 +.de P1 +.nr pg*P1 1 +.. +.wh -\n[FM]u pg@bottom +.wh -\n[FM]u/2u pg*footer +.nr MINGW 2n +.nr pg@ncols 1 +.de @MC +.if !'\\n(.z'' .error-recover MC while diversion open +.br +.ie \\n[pg@ncols]>1 .pg@super-eject +.el \{\ +. \" flush out any floating keeps +. while \\n[kp@tail]>\\n[kp@head] \{\ +. rs +. bp +. \} +.\} +.ie !\\n(.$ \{\ +. nr pg@colw \\n[LL]*7/15 +. nr pg*gutw \\n[LL]-(2*\\n[pg@colw]) +. nr pg@ncols 2 +.\} +.el \{\ +. nr pg@colw (n;\\$1)<?\\n[LL] +. 
ie \\n[.$]<2 .nr pg*gutw \\n[MINGW] +. el .nr pg*gutw (n;\\$2) +. nr pg@ncols \\n[LL]-\\n[pg@colw]/(\\n[pg@colw]+\\n[pg*gutw])+1 +. ie \\n[pg@ncols]>1 \ +. nr pg*gutw \\n[LL]-(\\n[pg@ncols]*\\n[pg@colw])/(\\n[pg@ncols]-1) +. el .nr pg*gutw 0 +.\} +.mk pg*col-top +.ns +.nr pg*col-num 0 +.nr pg@fn-colw \\n[pg@colw]*5/6 +.par@reset +.. +.de 2C +.MC +.. +.de 1C +.MC \\n[LL]u +.. +.\" top of page macro +.de pg@top +.ch pg*footer -\\n[FM]u/2u +.nr PN \\n% +.nr pg*col-num 0 +.nr pg@fn-bottom-margin 0 +.nr pg*saved-po \\n[PO] +.po \\n[PO]u +.ev h +.par@reset +.sp (u;\\n[HM]/2) +.PT +.sp |\\n[HM]u +.if d HD .HD +.mk pg@header-bottom +.ev +.mk pg*col-top +.pg*start-col +.. +.de pg*start-col +.\" Handle footnote overflow before floating keeps, because the keep +.\" might contain an embedded footnote. +.fn@top-hook +.kp@top-hook +.tbl@top-hook +.ns +.. +.de pg@cs-top +.sp \\n[HM]u +.\" move pg@bottom and pg*footer out of the way +.ch pg@bottom \\n[.p]u*2u +.ch pg*footer \\n[.p]u*2u +.ns +.. +.de pg@bottom +.tbl@bottom-hook +.if \\n[pg@fn-flag] .fn@bottom-hook +.nr pg*col-num +1 +.ie \\n[pg*col-num]<\\n[pg@ncols] .pg*end-col +.el .pg*end-page +.. +.de pg*end-col +'sp |\\n[pg*col-top]u +.po (u;\\n[pg*saved-po]+(\\n[pg@colw]+\\n[pg*gutw]*\\n[pg*col-num])) +.\"po +(u;\\n[pg@colw]+\\n[pg*gutw]) +.pg*start-col +.. +.de pg*end-page +.po \\n[pg*saved-po]u +.\" Make sure we don't exit if there are still floats or footnotes left-over. +.ie \\n[kp@head]<\\n[kp@tail]:\\n[fn@have-overflow] \{\ +. \" Switching environments ensures that we don't get an unnecessary +. \" blank line at the top of the page. +. ev ne +' bp +. ev +.\} +.el \{\ +. if r pg*next-number \{\ +. pn \\n[pg*next-number] +. rr pg*next-number +. if d pg*next-format \{\ +. af PN \\*[pg*next-format] +. rm pg*next-format +. \} +. \} +' bp +.\} +.. +.\" pg@begin number format +.de pg@begin +.ie \\n[.$]>0 \{\ +. nr pg*next-number (;\\$1) +. ie \\n[.$]>1 .ds pg*next-format \\$2 +. 
el .rm pg*next-format +.\} +.el .rr pg*next-number +.pg@super-eject +.. +.\" print the footer line +.de pg*footer +.ev h +.par@reset +.BT +.ev +.. +.\" flush out any keeps or footnotes +.de pg@super-eject +.br +.if !'\\n(.z'' .@error-recover diversion open while ejecting page +.\" Make sure we stay in the end macro while there is still footnote overflow +.\" left, or floating keeps. +.while \\n[kp@tail]>\\n[kp@head]:\\n[pg@fn-flag] \{\ +. rs +. bp +.\} +.bp +.. +.em pg@super-eject +.\" *************************** +.\" ******** module fn ******** +.\" *************************** +.\" Footnotes. +.nr fn@sep-dist 8p +.ev fn +.\" Round it vertically +.vs \n[fn@sep-dist]u +.nr fn@sep-dist \n[.v] +.ev +.nr fn*text-num 0 1 +.nr fn*note-num 0 1 +.ds * \\*[par@sup-start]\En+[fn*text-num]\\*[par@sup-end] +.nr fn*open 0 +.\" normal FS +.de @FS +.ie \\n[.$] .fn*do-FS "\\$1" no +.el \{\ +. ie \\n[fn*text-num]>\\n[fn*note-num] .fn*do-FS \\n+[fn*note-num] +. el .fn*do-FS +.\} +.. +.\" Second argument of `no' means don't embellish the first argument. +.de fn*do-FS +.if \\n[fn*open] .@error-recover nested FS +.nr fn*open 1 +.if \\n[.u] \{\ +. \" Ensure that the first line of the footnote is on the same page +. \" as the reference. I think this is minimal. +. ev fn +. nr fn*need 1v +. ev +. ie \\n[pg@fn-flag] .nr fn*need +\\n[fn:PD] +. el .nr fn*need +\\n[fn@sep-dist] +. ne \\n[fn*need]u+\\n[.V]u>?0 +.\} +.ev fn +.par@reset-env +.fn*start-div +.par@reset +.ie \\n[.$] .FP \\$@ +.el .@LP +.. +.de @FE +.ie !\\n[fn*open] .@error FE without FS +.el \{\ +. nr fn*open 0 +. br +. ev +. fn*end-div +.\} +.. +.nr fn@have-overflow 0 +.\" called at the top of each column +.de fn@top-hook +.nr fn*max-width 0 +.nr fn*page-bottom-pos 0-\\n[FM]-\\n[pg@fn-bottom-margin] +.ch pg@bottom \\n[fn*page-bottom-pos]u +.if \\n[fn@have-overflow] \{\ +. nr fn@have-overflow 0 +. fn*start-div +. ev nf +. fn@overflow-div +. ev +. fn*end-div +.\} +.. 
+.\" This is called at the bottom of the column if pg@fn-flag is set. +.de fn@bottom-hook +.nr pg@fn-flag 0 +.nr fn@have-overflow 0 +.nr fn@bottom-pos \\n[.p]-\\n[FM]-\\n[pg@fn-bottom-margin]+\\n[.v] +.ev fn +.nr fn@bottom-pos -\\n[.v] +.ev +.ie \\n[nl]+\\n[fn@sep-dist]+\n[.V]>\\n[fn@bottom-pos] \{\ +. rn fn@div fn@overflow-div +. nr fn@have-overflow 1 +.\} +.el \{\ +. if \\n[pg@ncols]>1 \ +. if \\n[fn*max-width]>\\n[pg@fn-colw] \ +. nr pg@fn-bottom-margin \\n[.p]-\\n[FM]-\\n[nl]+1v +. wh \\n[fn@bottom-pos]u fn*catch-overflow +. fn@print-sep +. ev nf +. fn@div +. rm fn@div +. ev +. if '\\n(.z'fn@overflow-div' \{\ +. di +. nr fn@have-overflow \\n[dn]>0 +. \} +. ch fn*catch-overflow +.\} +.. +.de fn*catch-overflow +.di fn@overflow-div +.. +.nr fn*embed-count 0 +.de @div-end!fn@div +.br +.if '\\n[.ev]'fn' .ev +.fn*end-div +.nr fn*open 0 +.. +.als @div-end!fn*embed-div @div-end!fn@div +.de fn*start-div +.ie '\\n(.z'' \{\ +. da fn@div +. if !\\n[pg@fn-flag] .ns +.\} +.el .di fn*embed-div +.. +.de fn*end-div +.ie '\\n(.z'fn@div' \{\ +. di +. nr fn*page-bottom-pos -\\n[dn] +. nr fn*max-width \\n[fn*max-width]>?\\n[dl] +. if !\\n[pg@fn-flag] .nr fn*page-bottom-pos -\\n[fn@sep-dist] +. nr pg@fn-flag 1 +. nr fn*page-bottom-pos \\n[nl]-\\n[.p]+\n[.V]>?\\n[fn*page-bottom-pos] +. ch pg@bottom \\n[fn*page-bottom-pos]u +.\} +.el \{\ +. ie '\\n(.z'fn*embed-div' \{\ +. di +. rn fn*embed-div fn*embed-div!\\n[fn*embed-count] +\!. fn*embed-start \\n[fn*embed-count] +. rs +' sp (u;\\n[dn]+\\n[fn@sep-dist]+\\n[.V]) +\!. fn*embed-end +. nr fn*embed-count +1 +. \} +. el \{\ +. ev fn +. @error-recover unclosed diversion within footnote +. \} +.\} +.. +.de fn*embed-start +.ie '\\n(.z'' \{\ +. fn*start-div +. ev nf +. fn*embed-div!\\$1 +. rm fn*embed-div!\\$1 +. ev +. fn*end-div +. di fn*null +.\} +.el \{\ +\!. fn*embed-start \\$1 +. rs +.\} +.. +.de fn*embed-end +.ie '\\n(.z'fn*null' \{\ +. di +. rm fn*null +.\} +.el \!.fn*embed-end +.. 
+.\" It's important that fn@print-sep use up exactly fn@sep-dist vertical space. +.de fn@print-sep +.ev fn +.in 0 +.vs \\n[fn@sep-dist]u +\D'l 1i 0' +.br +.ev +.. +.\" *************************** +.\" ******** module kp ******** +.\" *************************** +.\" Keeps. +.de KS +.br +.di kp*div +.. +.de KF +.if !'\\n(.z'' .@error-recover KF while open diversion +.di kp*fdiv +.ev k +.par@reset-env +.par@reset +.. +.de KE +.ie '\\n(.z'kp*div' .kp*end +.el \{\ +. ie '\\n(.z'kp*fdiv' .kp*fend +. el .@error KE without KS or KF +.\} +.. +.de @div-end!kp*div +.kp*end +.. +.de @div-end!kp*fdiv +.kp*fend +.. +.de kp*need +.ie '\\n(.z'' .ds@need \\$1 +.el \!.kp*need \\$1 +.. +.\" end non-floating keep +.de kp*end +.br +.di +.kp*need \\n[dn] +.ev nf +.kp*div +.ev +.rm kp*div +.. +.\" Floating keeps. +.nr kp@head 0 +.nr kp@tail 0 +.\" end floating keep +.de kp*fend +.br +.ev +.di +.ie \\n[.t]-(\\n[.k]>0*1v)>\\n[dn] \{\ +. br +. ev nf +. kp*fdiv +. rm kp*fdiv +. ev +.\} +.el \{\ +. rn kp*fdiv kp*div!\\n[kp@tail] +. nr kp*ht!\\n[kp@tail] 0\\n[dn] +. nr kp@tail +1 +.\} +.. +.\" top of page processing for KF +.nr kp*doing-top 0 +.de kp@top-hook +.if !\\n[kp*doing-top] \{\ +. nr kp*doing-top 1 +. kp*do-top +. nr kp*doing-top 0 +.\} +.. +.de kp*do-top +.\" If the first keep won't fit, only force it out if we haven't had a footnote +.\" and we're at the top of the page. +.nr kp*force \\n[pg@fn-flag]=0&(\\n[nl]<=\\n[pg@header-bottom]) +.nr kp*fits 1 +.while \\n[kp@tail]>\\n[kp@head]&\\n[kp*fits] \{\ +. ie \\n[.t]>\\n[kp*ht!\\n[kp@head]]:\\n[kp*force] \{\ +. nr kp*force 0 +. \" It's important to advance kp@head before bringing +. \" back the keep, so that if the last line of the +. \" last keep springs the bottom of page trap, a new +. \" page will not be started unnecessarily. +. rn kp*div!\\n[kp@head] kp*temp +. nr kp@head +1 +. ev nf +. kp*temp +. ev +. rm kp*temp +. \} +. el .nr kp*fits 0 +.\} +.. 
+.\" *************************** +.\" ******** module ds ******** +.\" *************************** +.\" Displays and non-floating keeps. +.de DE +.ds*end!\\n[\\n[.ev]:ds-type] +.nr \\n[.ev]:ds-type 0 +.. +.de ds@auto-end +.if \\n[\\n[.ev]:ds-type] \{\ +. @error automatically terminating display +. DE +.\} +.. +.de @div-end!ds*div +.ie \\n[\\n[.ev]:ds-type] .DE +.el .ds*end!2 +.. +.de ds*end!0 +.@error DE without DS, ID, CD, LD or BD +.. +.de LD +.br +.nr \\n[.ev]:ds-type 1 +.par@reset +.nf +.sp \\n[DD]u +.. +.de ID +.LD +.ie \\n[.$] .in +(n;\\$1) +.el .in +\\n[DI]u +.. +.de CD +.LD +.ce 9999 +.. +.de RD +.LD +.rj 9999 +.. +.de ds*common-end +.par@reset +.sp \\n[DD]u +.. +.als ds*end!1 ds*common-end +.de BD +.LD +.nr \\n[.ev]:ds-type 2 +.di ds*div +.. +.de ds*end!2 +.br +.ie '\\n(.z'ds*div' \{\ +. di +. nf +. in (u;\\n[.l]-\\n[dl]/2) +. ds*div +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.de DS +.br +.di ds*div +.ie '\\$1'B' \{\ +. LD +. nr \\n[.ev]:ds-type 4 +.\} +.el \{\ +. ie '\\$1'L' .LD +. el \{\ +. ie '\\$1'C' .CD +. el \{\ +. ie '\\$1'R' .RD +. el \{\ +. ie '\\$1'I' .ID \\$2 +. el .ID \\$1 +. \} +. \} +. \} +. nr \\n[.ev]:ds-type 3 +.\} +.. +.de ds@need +.if '\\n(.z'' \{\ +. while \\n[.t]<=(\\$1)&(\\n[nl]>\\n[pg@header-bottom]) \{\ +. rs +' sp \\n[.t]u +. \} +.\} +.. +.de ds*end!3 +.br +.ie '\\n(.z'ds*div' \{\ +. di +. ds@need \\n[dn] +. ev nf +. ds*div +. ev +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.de ds*end!4 +.ie '\\n(.z'ds*div' \{\ +. br +. di +. nf +. in (u;\\n[.l]-\\n[dl]/2) +. ds@need \\n[dn] +. ds*div +. rm ds*div +. ds*common-end +.\} +.el .@error-recover mismatched DE +.. +.\" **************************** +.\" ******** module par ******** +.\" **************************** +.\" Paragraph-level formatting. +.nr VARPS 0 +.nr PS 10 +.nr LL 6i +.de par*vs +.\" If it's too big to be in points, treat it as units. +.ie (p;\\$1)>=40p .vs (u;\\$1) +.el .vs (p;\\$1) +.. 
+.de par@ab-indent +.nr 0:li (u;\\n[LL]/12) +.nr 0:ri \\n[0:li] +.. +.de par*env-init +.aln \\n[.ev]:PS PS +.aln \\n[.ev]:VS VS +.aln \\n[.ev]:LL LL +.aln \\n[.ev]:MCLL LL +.aln \\n[.ev]:LT LT +.aln \\n[.ev]:MCLT LT +.aln \\n[.ev]:PI PI +.aln \\n[.ev]:PD PD +.par@reset-env +.. +.\" happens when the first page begins +.de par@init +.if !rLT .nr LT \\n[LL] +.if !rFL .nr FL \\n[LL]*5/6 +.if !rVS .nr VS \\n[PS]+2 +.ps \\n[PS] +.if !rDI .nr DI .5i +.if !rQI .nr QI 5n +.if !rPI .nr PI 5n +.par*vs \\n[VS] +.if !rPD .nr PD .3v +.if !rDD .nr DD .5v +.if !dFAM .ds FAM \\n[.fam] +.nr par*adj \\n[.j] +.par*env-init +.ev h +.par*env-init +.ev +.ev fn +.par*env-init +.ev +.ev k +.par*env-init +.ev +.aln 0:MCLL pg@colw +.aln 0:MCLT pg@colw +.aln k:MCLL pg@colw +.aln k:MCLT pg@colw +.if !rFPS .nr FPS \\n[PS]-2 +.if !rFVS .nr FVS (p;\\n[FPS]+2) +.if !rFI .nr FI 2n +.if !rFPD .nr FPD \\n[PD]/2 +.aln fn:PS FPS +.aln fn:VS FVS +.aln fn:LL FL +.aln fn:LT FL +.aln fn:PI FI +.aln fn:PD FPD +.aln fn:MCLL pg@fn-colw +.aln fn:MCLT pg@fn-colw +.. +.de par@reset-env +.nr \\n[.ev]:il 0 +.nr \\n[.ev]:li 0 +.nr \\n[.ev]:ri 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.nr \\n[.ev]:pli 0 +.nr \\n[.ev]:pri 0 +.nr \\n[.ev]:ds-type 0 +.. +.\" par@reset +.de par@reset +.br +.ce 0 +.rj 0 +.ul 0 +.fi +.ad \\n[par*adj] +.ie \\n[pg@ncols]>1 \{\ +. ll (u;\\n[\\n[.ev]:MCLL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) +. lt \\n[\\n[.ev]:MCLT]u +.\} +.el \{\ +. ll (u;\\n[\\n[.ev]:LL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) +. lt \\n[\\n[.ev]:LT]u +.\} +.in (u;\\n[\\n[.ev]:li]+\\n[\\n[.ev]:pli]) +.ft 1 +.fam \\*[FAM] +.ps \\n[\\n[.ev]:PS] +.par*vs \\n[\\n[.ev]:VS] +.ls 1 +.TA +.hy 14 +.. +.als @RT par@reset +.\" This can be redefined by the user. +.de TA +.ta T 5n +.. +.de par*start +.ds@auto-end +.nr \\n[.ev]:pli \\$1 +.nr \\n[.ev]:pri \\$2 +.par@reset +.sp \\n[\\n[.ev]:PD]u +.ne 1v+\\n(.Vu +.. +.de par@finish +.nr \\n[.ev]:pli 0 +.nr \\n[.ev]:pri 0 +.par@reset +.. 
+.\" normal LP +.de @LP +.par*start 0 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.. +.de @PP +.par*start 0 0 +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.ti +\\n[\\n[.ev]:ai]u +.. +.de @QP +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par*start \\n[QI] \\n[QI] +.. +.de @XP +.par*start \\n[\\n[.ev]:PI] 0 +.ti -\\n[\\n[.ev]:PI]u +.. +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +. br +. \} +. rm par*label +.\} +.. +.de @RS +.br +.nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] +.nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] +.nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] +.nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] +.nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] +.nr \\n[.ev]:il +1 +.nr \\n[.ev]:li +\\n[\\n[.ev]:ai] +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par@reset +.. +.de @RE +.br +.ie \\n[\\n[.ev]:il] \{\ +. nr \\n[.ev]:il -1 +. nr \\n[.ev]:ai \\n[\\n[.ev]:ai!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:li \\n[\\n[.ev]:li!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:ri \\n[\\n[.ev]:ri!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:pli \\n[\\n[.ev]:pli!\\n[\\n[.ev]:il]] +. nr \\n[.ev]:pri \\n[\\n[.ev]:pri!\\n[\\n[.ev]:il]] +.\} +.el .@error unbalanced \\$0 +.par@reset +.. +.\" --------------------------------------------------------------------------- +.de LINE +. br +. ps 32 +\l'\\n[.l]u-\\n[\\n[.ev]:ri]u-\\n[\\n[.ev]:pri]u' +. ps +.. +.\" --------------------------------------------------------------------------- +.de QSTART +. nr SaveQI \\n[QI] +. if \\n[.$] .nr QI \\$1 +. QS +. LINE +. ft 3 +.. +.\" --------------------------------------------------------------------------- +.de QEND +. ft P +. sp -.5 +. LINE +. QE +. nr QI \\n[SaveQI] +. if \\n[.$] .sp \\$1 +.. 
+.de @QS +.br +.nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] +.nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] +.nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] +.nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] +.nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] +.nr \\n[.ev]:il +1 +.nr \\n[.ev]:li +\\n[QI] +.nr \\n[.ev]:ri +\\n[QI] +.nr \\n[.ev]:ai \\n[\\n[.ev]:PI] +.par@reset +.. +.als @QE @RE +.\" start boxed text +.de B1 +.br +.di par*box-div +.nr \\n[.ev]:li +1n +.nr \\n[.ev]:ri +1n +.par@reset +.. +.de @div-end!par*box-div +.B2 +.. +.\" end boxed text +.\" Postpone the drawing of the box until we're in the top-level diversion, +.\" in case there's a footnote inside the box. +.de B2 +.ie '\\n(.z'par*box-div' \{\ +. br +. di +. ds@need \\n[dn] +. par*box-mark-top +. ev nf +. par*box-div +. ev +. nr \\n[.ev]:ri -1n +. nr \\n[.ev]:li -1n +. par@finish +. par*box-draw \\n[.i]u \\n[.l]u +.\} +.el .@error B2 without B1 +.. +.de par*box-mark-top +.ie '\\n[.z]'' .mk par*box-top +.el \!.par*box-mark-top +.. +.de par*box-draw +.ie '\\n[.z]'' \{\ +. nr par*box-in \\n[.i] +. nr par*box-ll \\n[.l] +. nr par*box-vpt \\n[.vpt] +. vpt 0 +. in \\$1 +. ll \\$2 +\v'-1v+.25m'\ +\D'l (u;\\n[.l]-\\n[.i]) 0'\ +\D'l 0 |\\n[par*box-top]u'\ +\D'l -(u;\\n[.l]-\\n[.i]) 0'\ +\D'l 0 -|\\n[par*box-top]u' +. br +. sp -1 +. in \\n[par*box-in]u +. ll \\n[par*box-ll]u +. vpt \\n[par*box-vpt] +.\} +.el \!.par*box-draw \\$1 \\$2 +.. +.de @SH +.par@finish +.\" Keep together the heading and the first two lines of the next paragraph. +.\" XXX - fix for variable PS. +.ne 3v+\\n[\\n[.ev]:PD]u+\\n(.Vu +.sp 1 +.ft 3 +.if \\n[VARPS] .ps \\n[PS]+2 +.. +.\" TL, AU, and AI are aliased to these in cov*ab-init. +.de par@TL +.par@finish +.sp 1 +.ft 3 +.ps +2 +.vs +3p +.ce 9999 +.. +.de par@AU +.par@finish +.sp 1 +.ft I +.ce 9999 +.. +.de par@AI +.par@finish +.sp .5 +.ce 9999 +.. +.\" In paragraph macros. +.de NL +.ps \\n[\\n[.ev]:PS] +.. +.de SM +.ps -2 +.. +.de LG +.ps +2 +.. +.de R +.ft R +.. 
+.de par*set-font +.ie \\n[.$] \{\ +. nr par*prev-font \\n[.f] +\&\\$3\f[\\*[par*font-name!\\$0]]\\$1\f[\\n[par*prev-font]]\\$2 +.\} +.el .ft \\*[par*font-name!\\$0] +.. +.ds par*font-name!B 3 +.ds par*font-name!I 2 +.ds par*font-name!BI BI +.ds par*font-name!CW CR +.als B par*set-font +.als I par*set-font +.als BI par*set-font +.als CW par*set-font +.\" underline a word +.de UL +\Z'\\$1'\v'.25m'\D'l \w'\\$1'u 0'\v'-.25m'\\$2 +.. +.\" box a word +.de BX +.nr par*bxw \w'\\$1'+.4m +\Z'\v'.25m'\D'l 0 -1m'\D'l \\n[par*bxw]u 0'\D'l 0 1m'\D'l -\\n[par*bxw]u 0''\ +\Z'\h'.2m'\\$1'\ +\h'\\n[par*bxw]u' +.. +.\" The first time UX is used, put a registered mark after it. +.ds par*ux-rg \(rg +.de UX +\s[\\n[.s]*8u/10u]UNIX\s0\\$1\\*[par*ux-rg] +.ds par*ux-rg +.. +.ds par@sup-start \v'-.9m\s'\En[.s]*7u/10u'+.7m' +.als { par@sup-start +.ds par@sup-end \v'-.7m\s0+.9m' +.als } par@sup-end +.\" footnote paragraphs +.\" FF is the footnote format +.nr FF 0 +.\" This can be redefined. It gets a second argument of `no' if the first +.\" argument was supplied by the user, rather than automatically. +.de FP +.br +.if !d par*fp!\\n[FF] \{\ +. @error unknown footnote format `\\n[FF]' +. nr FF 0 +.\} +.ie '\\$2'no' .par*fp!\\n[FF]-no "\\$1" +.el .par*fp!\\n[FF] "\\$1" +.. +.de par*fp!0 +.@PP +\&\\*[par@sup-start]\\$1\\*[par@sup-end]\ \c +.. +.de par*fp!0-no +.@PP +\&\\$1\ \c +.. +.de par*fp!1 +.@PP +\&\\$1.\ \c +.. +.de par*fp!1-no +.@PP +\&\\$1\ \c +.. +.de par*fp!2 +.@LP +\&\\$1.\ \c +.. +.de par*fp!2-no +.@LP +\&\\$1\ \c +.. +.de par*fp!3 +.@IP "\\$1." (u;\\n[\\n[.ev]:PI]*2) +.. +.de par*fp!3-no +.@IP "\\$1" (u;\\n[\\n[.ev]:PI]*2) +.. +.\" *************************** +.\" ******** module nh ******** +.\" *************************** +.\" Numbered headings. +.\" nh*hl is the level of the last heading +.nr nh*hl 0 +.\" numbered heading +.de @NH +.ie '\\$1'S' \{\ +. shift +. nr nh*hl 0 +. while \\n[.$] \{\ +. nr nh*hl +1 +. nr H\\n[nh*hl] 0\\$1 +. shift +. \} +. if !\\n[nh*hl] \{\ +. 
nr H1 1 +. nr nh*hl 1 +. @error missing arguments to .NH S +. \} +.\} +.el \{\ +. nr nh*ohl \\n[nh*hl] +. ie \\n[.$] \{\ +. nr nh*hl 0\\$1 +. ie \\n[nh*hl]<=0 \{\ +. nr nh*ohl 0 +. nr nh*hl 1 +. \} +. el \{\ +. if \\n[nh*hl]-\\n[nh*ohl]>1 \ +. @warning .NH \\n[nh*ohl] followed by .NH \\n[nh*hl] +. \} +. \} +. el .nr nh*hl 1 +. while \\n[nh*hl]>\\n[nh*ohl] \{\ +. nr nh*ohl +1 +. nr H\\n[nh*ohl] 0 +. \} +. nr H\\n[nh*hl] +1 +.\} +.ds SN +.nr nh*i 0 +.while \\n[nh*i]<\\n[nh*hl] \{\ +. nr nh*i +1 +. as SN \\n[H\\n[nh*i]]. +.\} +.SH +.if \\n[VARPS] \{\ +. ps \\n[PS]+2 +. ne 3 +.\} +\\*[SN] +.. +.de VARPS +.nr VARPS 1 +.. +.\" **************************** +.\" ******** module toc ******** +.\" **************************** +.\" Table of contents generation. +.de XS +.da toc*div +.ev h +.par@reset +.fi +.ie \\n[.$] .XA "\\$1" +.el .XA +.. +.de @div-end!toc*div +.XE +.. +.de XA +.ie '\\n(.z'toc*div' \{\ +. if d toc*num .toc*end-entry +. ie \\n[.$] \{\ +. ie '\\$1'no' .ds toc*num +. el .ds toc*num "\\$1 +. \} +. el .ds toc*num \\n[PN] +. in (n;0\\$2) +.\} +.el .@error XA without XS +.. +.de XE +.ie '\\n(.z'toc*div' \{\ +. if d toc*num .toc*end-entry +. ev +. di +.\} +.el .@error XS without XE +.. +.de toc*end-entry +\\a\\t\\*[toc*num] +.br +.rm toc*num +.. +.de PX +.1C +.if !'\\$1'no' \{\ +. ce 1 +. ps \\n[PS]+2 +. ft 3 +\\*[TOC] +. ft +. ps +.\} +.nf +.char \[toc*leader-char] .\h'1m' +.lc \[toc*leader-char] +.ta (u;\\n[.l]-\\n[.i]-\w'000') (u;\\n[.l]-\\n[.i])R +.sp 2 +.toc*div +.par@reset +.. +.\" print the table of contents on page i +.de TC +.P1 +.pg@begin 1 i +.PX \\$1 +.. +.\" **************************** +.\" ******** module eqn ******** +.\" **************************** +.\" Eqn support. +.de EQ +.. +.de EN +.. +.de @EQ +.br +.ds eqn*num "\\$2 +.ie '\\$1'L' .nr eqn*type 0 +.el \{\ +. ie '\\$1'I' .nr eqn*type 1 +. el \{\ +. nr eqn*type 2 +. if !'\\$1'C' .ds eqn*num "\\$1 +. \} +.\} +.di eqn*div +.in 0 +.nf +.. +.de @div-end!eqn*div +.@EN +.. 
+.\" Note that geqn mark and lineup work correctly in centered equations. +.de @EN +.ie !'\\n(.z'eqn*div' .@error-recover mismatched EN +.el \{\ +. br +. di +. nr eqn*have-num 0 +. if !'\\*[eqn*num]'' .nr eqn*have-num 1 +. if \\n[dl]:\\n[eqn*have-num] \{\ +. sp \\n[DD]u +. par@reset +. ds eqn*tabs \\n[.tabs] +. nf +. ie \\n[dl] \{\ +. ds@need \\n[dn]u-1v+\n[.V]u +. chop eqn*div +. ie \\n[eqn*type]=0 \{\ +. ta (u;\\n[.l]-\\n[.i])R +\\*[eqn*div]\t\\*[eqn*num] +. \} +. el \{\ +. ie \\n[eqn*type]=1 .ta \\n[DI]u \ +(u;\\n[.l]-\\n[.i])R +. el .ta (u;\\n[.l]-\\n[.i]/2)C \ +(u;\\n[.l]-\\n[.i])R +\t\\*[eqn*div]\t\\*[eqn*num] +. \} +. \} +. el \{\ +. ta (u;\\n[.l]-\\n[.i])R +\t\\*[eqn*num] +. \} +. sp \\n[DD]u +. fi +. ta \\*[eqn*tabs] +. \} +.\} +.. +.\" **************************** +.\" ******** module tbl ******** +.\" **************************** +.\" Tbl support. +.nr tbl*have-header 0 +.de TS +.\" The break is necessary in the case where the first page has not yet begun. +.br +.sp \\n[DD]u +.if '\\$1'H' .di tbl*header-div +.. +.de tbl@top-hook +.if \\n[tbl*have-header] \{\ +. ie \\n[.t]-\\n[tbl*header-ht]-1v .tbl*print-header +. el .sp \\n[.t]u +.\} +.. +.de tbl*print-header +.ev nf +.tbl*header-div +.ev +.mk #T +.. +.de TH +.ie '\\n[.z]'tbl*header-div' \{\ +. nr T. 0 +. T# +. br +. di +. ie \\n[dn]+\\n[FM]+\\n[HM]+2v>=\\n[.p] \{\ +. @error ridiculously long table header +. ds@need \\n[dn] +. tbl*print-header +. \} +. el \{\ +. nr tbl*header-ht \\n[dn] +. ds@need \\n[dn]u+1v +. tbl*print-header +. nr tbl*have-header 1 +. \} +.\} +.el .@error-recover .TH without .TS H +.. +.de @div-end!tbl*header-div +.TH +.TE +.. +.de TE +.ie '\\n(.z'tbl*header-div' .@error-recover .TS H but no .TH before .TE +.el \{\ +. nr tbl*have-header 0 +. sp \\n[DD]u +.\} +.\" reset tabs +.TA +.. +.de tbl@bottom-hook +.if \\n[tbl*have-header] \{\ +. nr T. 1 +. T# +.\} +.. +.de T& +.. 
+.\" **************************** +.\" ******** module pic ******** +.\" **************************** +.\" Pic support. +.\" PS height width +.de PS +.br +.sp \\n[DD]u +.ie \\n[.$]<2 .@error bad arguments to PS (not preprocessed with pic?) +.el \{\ +. ds@need (u;\\$1)+1v +. in +(u;\\n[.l]-\\n[.i]-\\$2/2>?0) +.\} +.. +.de PE +.par@reset +.sp \\n[DD]u+.5m +.. +.\" **************************** +.\" ******** module ref ******** +.\" **************************** +.\" Refer support. +.de ]- +.rm [A [B [C [D [E [G [I [J [N [O [P [Q [R [S [T [V +.rm ref*string +.. +.\" Other +.ds ref*spec!0 Q A T S V N P I C D O +.\" Journal article +.ds ref*spec!1 Q A T J S V N P I C D O +.\" Book +.ds ref*spec!2 Q A T S V P I C D O +.\" Article within book +.ds ref*spec!3 Q A T B E S V P I C D O +.\" Tech report +.ds ref*spec!4 Q A T R G P I C D O +.\" ][ type +.de ][ +.ie d ref*spec!\\$1 .ref*build \\*[ref*spec!\\$1] +.el \{\ +. @error unknown reference type `\\$1' +. ref*build \\*[ref*spec!0] +.\} +.ref*print +.rm ref*string +.rm [F +.. +.\" start of reference number +.ds [. \\*[par@sup-start] +.\" end of reference number +.ds .] \\*[par@sup-end] +.\" period before reference +.ds <. . +.\" period after reference +.ds >. \" empty +.\" comma before reference +.ds <, , +.\" comma after reference +.ds >, \" empty +.\" start collected references +.de ]< +.als ref*print ref*end-print +.SH +\&\\*[REFERENCES] +.par@reset +.. +.\" end collected references +.de ]> +.par@finish +.als ref*print ref*normal-print +.. +.de ref*normal-print +.ie d [F .FS "\\*([.\\*([F\\*(.]" +.el .FS \& +\\*[ref*string] +.FE +.. +.de ref*end-print +.ie d [F .IP "\\*([F." +.el .XP +\\*[ref*string] +.. +.als ref*print ref*normal-print +.de ref*build +.rm ref*string ref*post-punct +.nr ref*suppress-period 1 +.while \\n[.$] \{\ +. if d [\\$1 \{\ +. ie d ref*add-\\$1 .ref*add-\\$1 +. el .ref*add-dflt \\$1 +. \} +. shift +.\} +.\" now add a final period +.ie d ref*string \{\ +. if !\\n[ref*suppress-period] .as ref*string . 
+. if d ref*post-punct \{\ +. as ref*string "\\*[ref*post-punct] +. rm ref*post-punct +. \} +.\} +.el .ds ref*string +.. +.de ref*add-T +.ref*field T , "\\*Q" "" "\\*U" +.if r [T .nr ref*suppress-period \\n([T +.. +.de ref*add-P +.ie \\n([P>0 .ref*field P , "pp. " +.el .ref*field P , "p. " +.. +.de ref*add-J +.ref*field J , \f2 "" \fP +.. +.de ref*add-D +.ref*field D "" ( ) +.. +.de ref*add-E +.ref*field E , "ed. " +.. +.de ref*add-G +.ref*field G "" ( ) +.. +.de ref*add-B +.ref*field B "" "in \f2" "" \fP +.. +.de ref*add-O +.ref*field O . +.ie r [O .nr ref*suppress-period \\n([O +.el .nr ref*suppress-period 1 +.. +.de ref*add-A +.ref*field A , +.if r [A .nr ref*suppress-period \\n([A +.. +.de ref*add-dflt +.ref*field \\$1 , +.. +.\" First argument is the field letter. +.\" Second argument is the punctuation character to use to separate this field +.\" from the previous field. +.\" Third argument is a string with which to prefix this field. +.\" Fourth argument is a string with which to postfix this field. +.\" Fifth argument is a string to add after the punctuation character supplied +.\" by the next field. +.de ref*field +.if d ref*string \{\ +. ie d ref*post-punct \{\ +. as ref*string "\\$2\\*[ref*post-punct] \" +. rm ref*post-punct +. \} +. el .as ref*string "\\$2 \" +.\} +.as ref*string "\\$3\\*([\\$1\\$4 +.if \\n[.$]>4 .ds ref*post-punct "\\$5 +.nr ref*suppress-period 0 +.. +.\" **************************** +.\" ******** module acc ******** +.\" **************************** +.\" Accents and special characters. 
+.ds Q \)``\) +.ds U \)''\) +.ds - \(em +.\" Characters +.if !c\(rg .char \(rg (R) +.if !c\(ah .char \(ah \v'-.55m'\s[\En[.s]/2u]v\s0\v'.55m' +.if !c\(ad .char \(ad \v'-.55m'\s[\En[.s]*7u/10u].\h'.05m'.\s0\v'.55m' +.if !c\(a- .char \(a- \v'-.55m'\D'l .25m 0'\v'.55m' +.if !c\(ao .char \(ao \v'-.55m'\s[\En[.s]*6u/10u]\D'c .25m'\s0\v'.55m' +.if !c\(ac .char \(ac \s[\En[.s]*8u/10u]\v'.05m',\v'-.05m'\s0 +.if !c\(ho .char \(ho \s[\En[.s]/2u]\v'.4m'c\v'-.4m'\s0 +.if !c\(-D .char \(-D \Z'\v'-.1m'-'D +.if !c\(Sd .char \(Sd \Z'\v'-.3m'\h'.2m'-'\(pd +.if !c\(TP .char \(TP I\h'-.25m'\v'-.33m'\s[\En[.s]*6u/10u]\v'.33m'D\ +\v'-.33m'\s0\v'.33m' +.if !c\(Tp .char \(Tp \zlp +.if !c\(ss .char \(ss \(*b +.if !c\(AE .char \(AE A\h'-.3m'E +.if !c\(ae .char \(ae a\h'-.19m'e +.if !c\(OE .char \(OE O\h'-.25m'E +.if !c\(oe .char \(oe o\h'-.14m'e +.if !c\(r? .char \(r? \Z'\h'.1m'\v'-.15m'\s[\En[.s]*7u/10u]i\s0\v'.15m''\ +\v'.15m'\s[\En[.s]*7u/10u]c\s0\v'-.15m' +.if !c\(r! .char \(r! \h'.1m'\Z'\v'-.4m'\s[\En[.s]*8u/10u].\s0\v'.4m''\ +\s[\En[.s]*8u/10u]\v'.4m'\(or\v'-.4m'\s0\h'.1m' +.\" The idea of this definition is for the top of the 3 to be at the x-height. +.\" A yogh really ought to have a little line going north-west from the top +.\" left of the 3. +.if !c\[yogh] .char \[yogh] \Z'\v'\w'x'*0-\En[rst]u'\s[\En[.s]*8u/10u]\ +\v'\w'3'*0+\En[rst]u'3\s0'\h'\w'\s[\En[.s]*8u/10u]3'u' +.\" Accents +.de acc*over-def +.ds \\$1 \Z'\v'(u;\w'x'*0+\En[rst]-\En[.cht])'\ +\h'(u;-\En[skw]+(-\En[.w]-\w'\\$2'/2)+\En[.csk])'\\$2' +.. +.de acc*under-def +.ds \\$1 \Z'\v'\En[.cdp]u'\h'(u;-\En[.w]-\w'\\$2'/2)'\\$2' +.. +.de acc*slash-def +.ds \\$1 \Z'\h'(u;-\En[.w]-\w'\\$2'/2)'\ +\v'(u;\En[.cdp]-\En[.cht]+\En[rst]+\En[rsb]/2)'\\$2' +.. +.de acc*prefix-def +.ds \\$1 \Z'\h'(u;\w'x'-\w'\\$2'/2)'\\$2' +.. 
+.acc*prefix-def ' \' +.acc*prefix-def ` \` +.acc*prefix-def ^ ^ +.acc*prefix-def , \(ac +.acc*prefix-def : \(ad +.acc*prefix-def ~ ~ +.\" improved accent marks +.de AM +.acc*over-def ' \' +.acc*over-def ` \` +.acc*over-def ^ ^ +.acc*over-def ~ ~ +.acc*over-def : \(ad +.acc*over-def v \(ah +.acc*over-def _ \(a- +.acc*over-def o \(ao +.acc*under-def , \(ac +.acc*under-def . \s[\En[.s]*8u/10u]\v'.2m'.\v'-.2m'\s0 +.acc*under-def hook \(ho +.acc*slash-def / / +.char \[hooko] o\\\\*[hook] +.ds q \[hooko] +.ds 3 \[yogh] +.ds D- \(-D\" Icelandic uppercase eth +.ds d- \(Sd\" Icelandic lowercase eth +.ds Th \(TP\" Icelandic uppercase thorn +.ds th \(Tp\" Icelandic lowercase thorn +.ds 8 \(ss\" German double s +.ds Ae \(AE\" AE ligature +.ds ae \(ae\" ae ligature +.ds Oe \(OE\" OE ligature +.ds oe \(oe\" oe ligature +.ds ? \(r?\" upside down ? +.ds ! \(r!\" upside down ! +.. +.\" Make sure that no blank lines creep in at the end of this file. diff --git a/performance/lmbench3/doc/usenix.ol b/performance/lmbench3/doc/usenix.ol new file mode 100644 index 0000000..e3f2796 --- /dev/null +++ b/performance/lmbench3/doc/usenix.ol @@ -0,0 +1,102 @@ +Introduction + What is it? + A bunch of speed of light benchmarks, + not MP, not throughput, not saturation, not stress tests. + A microbenchmark suite + Measures system performance + Latency and bandwidth measurements + Measurements focus on OS and hardware + What is delivered to the application + Not marketing numbers + Benchmark performance predicts application performance + Results for which systems? + Sun, SGI, DEC, IBM, HP, PCs + Useful information to whom? + Performance engineers, system programmers, system architects. +Motivation + What are we measuring? + Control / latency operations + Bandwidth operations + What aren't we measuring? + Basic MIPS & MFLOPS. XXX - not unless I do it right. + What can I learn? + Cost of operations + ****Operations per time unit**** + Compare speed of alternative paths (e.g. mmap vs. 
read) + Performance problems = f(bw issues + latency issues) + Give at least two examples + NFS control & data: UDP lat, proc lat, & various BW metrics + Oracle lock manager: TCP lat + Verilog: mem lat + AIM: fs ops XXX -ask Scott about pipes. + Knowing the speeds of primitives can provide speeds of apps. + An example here would be nice. +Outline + Describe benchmark + Give results from current machines + Discuss results + Future changes, enhancements, etc. +Tutorial on benchmarks + For each metric + what is it? + why is it being measured? + How is it measured? + Measuring subtleties + Interpreting the results +Latency + Process stuff + networking stuff + file system stuff + memory stuff + whatever +Bandwidth + networking + file system + memory +Results + Tabular results - XXX update that table to reflect the newer metrics + Graphs of memory latency & context switches + Discussion + Memory stuff + Maybe contrast AIX with the $100K IBM + uniprocessor w/ killer memory perf and point out + that it is the memory that is making AIX go + fast, it certainly isn't AIX. A more politic + observation would be that systems with good + memory performance tend to have good system + performance; the point being to shift people's + attention to system performance, especially + memory subsystem, as opposed to processor mips. + Comparisons + Maybe look at the table and draw attention to + really good and really bad numbers for various + platforms (like Linux' context switch time, + Linux fs ops, solaris syscall, process stuff, + 990 memory BW). +Graphs + A graph showing a range of really fast to really slow ops, all on the + same graph. Do bandwidth stuff normalized on MB/sec. + Carl sez: show both ops/sec and cost/op on two graphs. + A graph showing processor slow down due to memory misses, assuming + each instruction misses. 
Maybe a graph that shows # of clocks + (or better yet, # of instructions - think super scalar) that you would + have to have between each memory miss in order to run at the clock + speed. +War stories + Sun page coloring bug + SGI page coloring bug + SGI hippi bug - XXX ask Thomas + Sun bcopy bug +Lmbench [optional?] + how to get lmbench + how to compile + how to run + how to show results +Future work + More hardware stuff - better latency measurements (write lat, + cache to cache latency). + add throughput & saturation measurements +TODO + get some similar papers for comparison + Someday I need reasonable I/O benchmarks to show off good + big SMP machines like Challenge. diff --git a/performance/lmbench3/doc/usenix96.ms b/performance/lmbench3/doc/usenix96.ms new file mode 100644 index 0000000..ca46fd4 --- /dev/null +++ b/performance/lmbench3/doc/usenix96.ms @@ -0,0 +1,1798 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de draftPT +.\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP' +.. +.de PT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! 
To turn this off, s/BT/oldBT/ +.de draftBT +.\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' +.. +.de BT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 1995 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM .95i +.nr FM 1i +.nr PO .95i +.if t .po .95i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.75i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Portable tools for performance analysis +.ds author Larry McVoy +.ds lmbench \f(CWlmbench\fP +.ds lmdd \f(CWlmdd\fP +.ds bcopy \f(CWbcopy\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds exit \f(CWexit\fP +.ds fork \f(CWfork\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getpid \f(CWgetpid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds memmove \f(CWmemmove\fP +.ds mmap \f(CWmmap\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\"[\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] 
", " +.\" .R2 +.TL +\s(14lmbench: Portable tools for performance analysis\s0\** +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Silicon Graphics, Inc.\s0\fP +.AU +\s+2\fRCarl Staelin\fP +.AI +\s+2\fIHewlett-Packard Laboratories\s0\fP +.SP +.AB +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. In almost all +cases, the individual tests are the result of analysis and isolation +of a customer's actual performance problem. +.\" .SP +These tools can be, and currently are, used to compare different +system implementations from different vendors. +In several cases, +the benchmarks have uncovered previously unknown bugs and design flaws. +The results have shown a strong +correlation between memory system performance and overall performance. +.\" XXX - MP versus uniprocessors? +\*[lmbench] includes an extensible database of +results from systems current as of late 1995. +.AE +.if t .MC 3.05i +.FS +This paper first appeared in the January 1996 Usenix conference proceedings. +The version you are reading has new results as well as some corrections. +.FE +.NH 1 +Introduction +.PP +\*[lmbench] +provides a suite of benchmarks that attempt to measure the most commonly +found performance bottlenecks in a wide range of system applications. +These bottlenecks have been identified, isolated, and reproduced in a set +of small micro-benchmarks, which measure +system latency and bandwidth of data movement among +the processor and memory, network, file system, and disk. +The intent is to produce numbers that real +applications can reproduce, rather than the frequently +quoted and somewhat less reproducible marketing performance numbers. +.PP +The benchmarks focus on latency and bandwidth because +performance issues are usually caused by latency +problems, bandwidth problems, or some combination of the two. 
Each benchmark +exists because it captures some unique performance problem present in +one or more important applications. +For example, the TCP latency benchmark is an accurate predictor of the +Oracle distributed lock manager's performance, the memory latency +benchmark gives a strong indication of Verilog simulation performance, +and the file system latency benchmark models a critical path +in software development. +.PP +\*[lmbench] was developed to identify and evaluate system performance +bottlenecks present in many machines in 1993-1995. It is entirely +possible that computer architectures will have changed and advanced +enough in the next few years to render parts of this benchmark suite +obsolete or irrelevant. +.PP +\*[lmbench] is already in widespread use at many sites by both end +users and system designers. In some cases, \*[lmbench] has provided +the data necessary to discover and correct critical performance +problems that might have gone unnoticed. \*[lmbench] uncovered a +problem in Sun's memory management software +that made all pages map to the same location in the cache, effectively +turning a 512 kilobyte (K) cache into a 4K cache. +.PP +\*[lmbench] measures only a system's ability +to transfer data between processor, cache, memory, network, and disk. +It does not measure other parts of the system, such as the graphics subsystem, +nor is it a MIPS, MFLOPS, +throughput, saturation, stress, graphics, or multiprocessor test suite. +It is frequently run on multiprocessor (MP) systems to compare their performance +against +uniprocessor systems, but it does not take advantage of any multiprocessor +features. +.PP +The benchmarks are written using standard, portable +system interfaces and facilities commonly +used by applications, so +\*[lmbench] +is portable and comparable over a wide set of Unix systems. +\*[lmbench] has been run on +AIX, +BSDI, +HP-UX, +IRIX, +Linux, +FreeBSD, +NetBSD, +OSF/1, +Solaris, +and +SunOS. 
+Part of the suite has been run on Windows/NT as well. +.PP +\*[lmbench] +is freely distributed under +the Free Software Foundation's General Public License +.RN Stallman89 , +with the additional restriction +that results may be reported only if the benchmarks are unmodified. +.NH 1 +Prior work +.PP +Benchmarking and performance analysis is not a new endeavor. +There are too many other benchmark suites to list all of +them here. We compare \*[lmbench] +to a set of similar benchmarks. +.BU "I/O (disk) benchmarks" : +IOstone +.RN Park90 +wants to be an I/O benchmark, but actually measures the memory +subsystem; all of the tests fit easily in the cache. +IObench +.RN Wolman89 +is a systematic file system and disk benchmark, but it is +complicated and unwieldy. +In +.RN McVoy91 +we reviewed many I/O benchmarks and found them all +lacking because they took too long to run and +were too complex a solution to a fairly simple problem. We wrote a +small, simple I/O benchmark, \*[lmdd] that +measures sequential and random I/O far +faster than either IOstone or IObench. As part of +.RN McVoy91 +the results from \*[lmdd] were checked against IObench (as well as some other +Sun internal I/O benchmarks). \*[lmdd] proved to be more accurate than any +of the other benchmarks. +At least one disk vendor +routinely uses \*[lmdd] to do performance testing of its disk drives. +.SP +Chen and Patterson +.RN "Chen93, Chen94" +measure I/O performance under a +variety of workloads that are automatically varied to test the +range of the system's performance. +Our efforts differ in that we are more interested in the CPU overhead +of a single request, rather than the capacity of the system as a whole. +.BU "Berkeley Software Distribution's microbench suite" : +The BSD effort generated an extensive set of +test benchmarks to do regression testing (both quality and performance) +of the BSD releases. 
+We did not use this as a basis for our work (although we used ideas) +for the following reasons: +(a) missing tests \(em such as memory latency, +(b) too many tests, the results tended to be obscured under a mountain +of numbers, +and (c) wrong copyright \(em we wanted the +Free Software Foundation's General Public License. +.BU "Ousterhout's Operating System benchmark" : +.RN Ousterhout90 +proposes several system benchmarks to measure system call +latency, context switch time, and file system performance. +We used the same ideas as a basis for our work, while trying to +go farther. We measured a more complete set of +primitives, including some hardware measurements; went into greater depth +on some of the tests, such as context switching; and went to great +lengths to make the benchmark portable and extensible. +.BU "Networking benchmarks" : +\f(CWNetperf\fP measures networking bandwidth and latency and +was written by Rick Jones of Hewlett-Packard. +\*[lmbench] includes a smaller, +less complex benchmark that produces similar results. +.SP +\f(CWttcp\fP is a widely used benchmark in the Internet community. +Our version of the same benchmark +routinely delivers bandwidth numbers that are within 2% of the numbers +quoted by \f(CWttcp\fP. +.BU "McCalpin's stream benchmark" : +.RN McCalpin95 +has memory bandwidth measurements and results for a large number of +high-end systems. +We did not use these because we discovered them only after +we had results using our versions. +We will probably include McCalpin's benchmarks in \*[lmbench] +in the future. +.PP +In summary, we rolled our own because we wanted simple, portable +benchmarks that accurately measured a wide variety of operations that we +consider crucial to performance on today's systems. While portions of +other benchmark suites include similar work, none includes all of it, +few are as portable, and almost all are far more complex. Less filling, +tastes great. 
+.NH 1 +Benchmarking notes +.NH 2 +Sizing the benchmarks +.PP +The proper sizing of various benchmark parameters is crucial to ensure +that the benchmark is measuring the right component of system performance. +For example, memory-to-memory copy +speeds are dramatically affected by the location of the data: if +the size parameter is too small so +the data is in a cache, then the performance may be as much as ten times +faster than if the data is in memory. +On the other hand, if the memory size parameter is too big so the data +is paged to disk, then performance may be slowed to such an extent +that the benchmark seems to `never finish.' +.PP +\*[lmbench] takes the following approach to the cache and memory +size issues: +.BU +All of the benchmarks that could be affected +by cache size are run in a loop, +with increasing sizes (typically powers of two) until some maximum size +is reached. The results may then be plotted to see where the benchmark +no longer fits in the cache. +.BU +The benchmark verifies that there is sufficient memory to run all of the +benchmarks in main memory. A small test program allocates as much memory +as it can, clears the memory, +and then strides through that memory a page at a time, timing +each reference. If any reference takes more than a few microseconds, the +page is no longer in memory. The test program starts small and works forward +until either enough memory is seen as present or the memory limit is reached. +.NH 2 +Compile time issues +.PP +The GNU C compiler, \*[gcc], is the compiler we chose because +it gave the most reproducible results across platforms. +When \*[gcc] was not present, we used the vendor-supplied \f(CWcc\fP. +All of the benchmarks were compiled with optimization \f(CW-O\fP +except +the benchmarks that calculate clock speed and the context switch times, +which must be compiled without optimization in order to produce +correct results. 
No other optimization flags were enabled because +we wanted results that would be commonly seen by application writers. +.PP +All of the benchmarks were linked using the default manner of +the target system. For most if not all systems, the +binaries were linked using shared libraries. +.NH 2 +Multiprocessor issues +.PP +All of the multiprocessor systems ran the benchmarks in the same way as +the uniprocessor systems. Some systems allow users to pin processes +to a particular CPU, which sometimes results in better cache reuse. We +do not pin processes because it defeats the MP scheduler. +.\" XXX - I should do this on an IP19 and mark it as pinned. +In certain cases, this decision yields interesting results discussed later. +.NH 2 +Timing issues +.LP +.sp -.5 +.BU "Clock resolution" : +The benchmarks measure the elapsed time by reading the system clock via the +\*[gettimeofday] interface. On some systems this interface has a resolution +of 10 milliseconds, a long time relative to many of the benchmarks which +have results measured in tens to hundreds of microseconds. To compensate for +the coarse clock resolution, the benchmarks are hand-tuned to measure +many operations within a single time interval lasting for many clock ticks. +Typically, this is done by executing the operation in a small loop, sometimes +unrolled if the operation is exceedingly fast, and then dividing +the loop time by the loop count. +.BU Caching : +If the benchmark expects the data to be in the cache, the benchmark is +typically run several times; only the last result is recorded. +.SP +If the benchmark does not want to measure cache performance it sets +the size parameter larger than the cache. For example, the +\*[bcopy] benchmark by default copies 8 megabytes to 8 megabytes, +which largely defeats any second-level cache in use today. (Note that the +benchmarks are not trying to defeat the file or process page cache, +only the hardware caches.) 
+.br +.di bigtable +.ev keep +.ps 8 +.vs 9 +.so systems.tbl +.ps \n[PS] +.vs \n[VS] +.nr TABLE \n[TABLE]+1 +.ce 1 +.SP +\fBTable \n[TABLE].\ \ System descriptions.\fP +.SP +.di +.ev +.nr WHEN \n[dn]+\n[FM] +.nr THT \n[dn] +.de print*table +' sp .5 +' ev keep +' nf +' bigtable +. ne 1 +. wh -\n[WHEN]u skip*page +. fi +. ev +.. +.de skip*page +' sp \n[THT]u +. wh -\n[WHEN]u +.. +.wh -\n[WHEN]u print*table +.BU Variability : +The results of some benchmarks, most notably the context switch benchmark, had a tendency +to vary quite a bit, up to 30%. We suspect that the +operating system is not using the same set of physical +pages each time a process is created and we are seeing the effects of +collisions in the external caches. We compensate by running the +benchmark in a loop and taking the minimum result. Users interested in +the most accurate data are advised to verify the results on their +own platforms. +.PP +Many of the results included in the database were donated by users +and were not created by the authors. +Good benchmarking practice suggests that one should run the benchmarks +as the only user of a machine, without other resource intensive +or unpredictable processes or daemons. +.NH 2 +Using the \f(CBlmbench\fP database +.PP +\*[lmbench] includes a database of results that +is useful for comparison purposes. It is quite easy to +build the source, run the benchmark, and produce a table of results +that includes the run. All of the tables in this paper were produced +from the database included in \*[lmbench]. This paper is also +included with \*[lmbench] and may be reproduced incorporating new results. +For more information, consult the file \f(CWlmbench-HOWTO\fP in the +\*[lmbench] distribution. +.NH 1 +Systems tested +.PP +\*[lmbench] has been run on a wide variety of platforms. This +paper includes results from a representative subset of machines and +operating systems. 
+Comparisons between similar hardware running different operating +systems can be very illuminating, and we have included a few examples +in our results. +.PP +The systems are briefly characterized in Table 1. Please note that the list prices +are very approximate as is the year of introduction. +The SPECInt92 numbers are a little suspect since +some vendors have been ``optimizing'' for certain parts of SPEC. We try and +quote the original SPECInt92 numbers where we can. +.NH 2 +Reading the result tables +.PP +Throughout the rest of this paper, we present tables of results for many of the +benchmarks. All of the tables are sorted, from best to worst. Some tables +have multiple columns of results and those tables are sorted on only one of +the columns. The sorted column's heading will be in \fBbold\fP. +.NH 1 +Bandwidth benchmarks +.PP +By bandwidth, we mean the rate at which a particular facility can move +data. +We attempt to measure the data movement ability of a number of +different facilities: +library \*[bcopy], +hand-unrolled \*[bcopy], +direct-memory read and write (no copying), +pipes, +TCP sockets, +the \*[read] interface, +and +the \*[mmap] interface. +.NH 2 +Memory bandwidth +.PP +Data movement is fundamental to any operating system. +In the past, performance +was frequently measured in MFLOPS because floating point units were +slow enough that microprocessor systems were +rarely limited by memory bandwidth. Today, floating point units are usually much +faster than memory bandwidth, so many current MFLOP ratings can not be +maintained using memory-resident data; they are ``cache only'' ratings. +.PP +We measure the ability to +copy, read, and write data over a varying set of sizes. +There are too many results to report all of them here, so we concentrate on +large memory transfers. +.PP +We measure copy bandwidth two ways. The first is the user-level library +\*[bcopy] interface. 
+The second is a hand-unrolled loop that loads and stores +aligned 8-byte words. +In both cases, we took care to +ensure that the source and destination locations would not map to the same +lines if the any of the caches were direct-mapped. +In order to test memory bandwidth rather than cache bandwidth, +both benchmarks copy an 8M\** area to another 8M area. +(As secondary caches reach 16M, these benchmarks will have to +be resized to reduce caching effects.) +.FS +Some of the PCs had less than 16M of available memory; +those machines copied 4M. +.FE +.PP +The copy results actually represent one-half to one-third of the memory +bandwidth used to obtain those results since we are reading and writing +memory. If the cache line size is larger than the word stored, then +the written cache line will typically be read before it is written. The +actual amount of memory bandwidth used varies because some architectures +have special instructions specifically designed for the \*[bcopy] +function. Those architectures will move twice as much memory as +reported by this benchmark; less advanced architectures move three +times as much memory: the memory read, the memory read because it is +about to be overwritten, and the memory written. +.PP +The \*[bcopy] results reported in Table 2 +may be correlated with John McCalpin's \*[stream] +.RN McCalpin95 +benchmark results in the following manner: +the \*[stream] benchmark reports all of the memory moved +whereas the \*[bcopy] benchmark reports the bytes copied. So our +numbers should be approximately one-half to one-third of his numbers. +.PP +Memory reading is measured by an unrolled loop that sums up a series of +integers. On most (perhaps all) systems measured the integer +size is 4 bytes. The loop is unrolled such that most compilers generate +code that uses a constant offset with the load, resulting in a load and +an add for each word of memory. The add is an integer add that completes +in one cycle on all of the processors. 
Given that today's processor
+typically cycles at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000
+ns per cache line, the results reported here should be dominated by the
+memory subsystem, not the processor add unit.
+.PP
+The memory contents are added up because almost all C compilers
+would optimize out the whole loop when optimization was turned on, and
+would generate far too many instructions without optimization.
+The solution is to
+add up the data and pass the result as an unused argument to the
+``finish timing'' function.
+.PP
+Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect
+that pure reads should run at roughly twice the speed of \*[bcopy].
+Exceptions to this rule should be studied, for exceptions indicate a bug
+in the benchmarks, a problem in \*[bcopy], or some unusual hardware.
+.TSTART
+.so ../Results/tmp/bw_allmem.tbl
+.TEND "Memory bandwidth (MB/s)"
+.PP
+Memory writing is measured by an unrolled loop that stores a value into
+an integer (typically a 4 byte integer) and then increments the pointer.
+The processor cost of each memory operation is approximately the same
+as the cost in the read case.
+.PP
+The numbers reported in Table \n[TABLE]
+are not the raw hardware speed in some cases.
+The Power2\** is capable of up to 800M/sec read rates
+.FS
+Someone described this machine as a $1,000 processor on a $99,000 memory
+subsystem.
+.FE
+.RN McCalpin95
+and HP PA RISC (and other prefetching)
+systems also do better if higher levels of code optimization are used
+and/or the code is hand tuned.
+.PP
+The Sun libc bcopy in Table \n[TABLE]
+is better because they use a hardware specific bcopy
+routine that uses instructions new in SPARC V9 that were added specifically
+for memory movement. 
+.PP +The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because, +according to Intel, the write transaction turns into a read followed by +a write to maintain cache consistency for MP systems. +.NH 2 +IPC bandwidth +.PP +Interprocess communication bandwidth is frequently a performance issue. +Many Unix applications are composed of several processes communicating +through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation +system that prepared this paper, the \f(CWX Window System\fP, remote file access, +and \f(CWWorld Wide Web\fP servers. +.PP +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +TCP sockets are similar +to pipes except they are bidirectional and can cross machine +boundaries. +.PP +Pipe bandwidth is measured by creating two processes, a writer and a +reader, which transfer 50M of data in 64K transfers. +The transfer size was chosen so that the overhead of system calls +and context switching would not dominate the benchmark time. +The reader prints the timing results, which guarantees that all +data has been moved before the timing is finished. +.PP +TCP bandwidth is measured similarly, except the data is transferred in +1M page aligned transfers instead of 64K transfers. If the TCP +implementation supports it, the send and receive socket buffers are +enlarged to 1M, instead of the default 4-60K. We have found that +setting the transfer size equal to the socket buffer size produces the +greatest throughput over the most implementations. +.TSTART +.so ../Results/tmp/bw_ipc.tbl +.TEND "Pipe and local TCP bandwidth (MB/s)" +.PP +\*[bcopy] is important to this test because the +pipe write/read is typically implemented as a \*[bcopy] into the kernel +from the writer and then a \*[bcopy] from the kernel to the reader. 
+Ideally, these results would be approximately one-half of the +\*[bcopy] results. It is possible for the kernel \*[bcopy] +to be faster than the C library \*[bcopy] since the kernel may have +access to \*[bcopy] hardware unavailable to the C library. +.PP +It is interesting to compare pipes with TCP because the TCP benchmark is +identical to the pipe benchmark except for the transport mechanism. +Ideally, the TCP bandwidth would be as good as the pipe +bandwidth. It is not widely known that the +majority of the TCP cost is in the \*[bcopy], the checksum, +and the network interface driver. +The checksum and the driver may be safely eliminated in the loopback +case and if the costs have been eliminated, then TCP should be just as +fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to +see that Solaris and HP-UX have done this optimization. +.PP +Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the +pipe transfers are done in 64K buffers, a size that frequently fits in +caches, while the bcopy is typically an 8M-to-8M copy, which does not +fit in the cache. +.PP +In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than +the SGI MP on pipe bandwidth because of caching effects - in the UP +case, both processes share the cache; on the MP, each process is +communicating with a different cache. +.PP +All of the TCP results in Table \n[TABLE] are in loopback mode \(em that +is both ends of the socket are on the same machine. It was impossible +to get remote networking results for all the machines included in this +paper. We are interested in receiving more results for identical +machines with a dedicated network connecting them. The results we have +for over the wire TCP bandwidth are shown below. +.TSTART +.so tcp_bw.tbl +.TEND "Remote TCP bandwidth (MB/s)" +.PP +The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. 
+The SGI Hippi interface has hardware support for TCP checksums and +the IRIX operating system uses virtual memory tricks to avoid copying +data as much as possible. +For larger transfers, SGI Hippi has reached 92MB/s over TCP. +.PP +100baseT is looking quite competitive when compared to FDDI in Table +\n[TABLE], even though FDDI has packets that are almost three times +larger. We wonder how long it will be before we see gigabit ethernet +interfaces. +.NH 2 +Cached I/O bandwidth +.PP +Experience has shown us that reusing data in the file system +page cache can be a performance issue. This +section measures that operation through two interfaces, \*[read] and +\*[mmap]. +The benchmark here is not an I/O benchmark in that no disk activity is +involved. +We wanted to measure the overhead +of reusing data, an overhead that is CPU intensive, rather than disk intensive. +.PP +The \*[read] interface copies data from the kernel's file system page cache into the +process's buffer, using 64K buffers. The transfer size was chosen +to minimize the kernel entry overhead while +remaining realistically sized. +.PP +The difference between the \*[bcopy] and the \*[read] benchmarks +is the cost of the file and virtual memory system overhead. In most +systems, the \*[bcopy] speed should be faster than the \*[read] speed. The +exceptions usually have hardware specifically designed +for the \*[bcopy] function and that hardware may be available only to +the operating system. +.PP +The \*[read] benchmark is implemented by rereading a file +(typically 8M) in 64K +buffers. Each buffer is summed as a series of integers in the user +process. The summing is done for two reasons: for an apples-to-apples +comparison the memory-mapped benchmark needs to touch all the data, +and the file system can sometimes transfer data into memory faster than the +processor can read the data. 
+For example, \s-1SGI\s0's XFS can move data into memory at +rates in excess of 500M per second, but it can move data into +the cache at only 68M per second. The intent is to measure performance +delivered to the application, not DMA performance to memory. +.TSTART +.so ../Results/tmp/bw_reread2.tbl +.TEND "File vs. memory bandwidth (MB/s)" +.PP +The \*[mmap] interface provides a way to access the kernel's file cache +without copying the data. +The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) +into the +process's address space. The file is then summed to force the data +into the cache. +.PP +In Table \n[TABLE], +a good system will have \fIFile read\fP as fast as (or even faster than) +\fILibc bcopy\fP because as the file system overhead goes to zero, the +file reread case is virtually the same as the library \*[bcopy] case. +However, file reread can be faster because the kernel may have access to +\*[bcopy] assist hardware not available to the C library. +Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP +performance, but \*[mmap] is often dramatically worse. +Judging by the results, this looks to be a +potential area for operating system improvements. +.PP +In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes +full advantage of the memory subsystem from inside the kernel. +The mmap reread is probably slower because of the lower clock rate; +the page faults start to show up as a significant cost. +.PP +It is surprising that the Sun Ultra1 was able to bcopy at the high +rates shown in Table 2 but did not show those rates for file reread +in Table \n[TABLE]. +HP has the opposite problem, they get file reread faster than bcopy, +perhaps because the kernel \*[bcopy] has access to hardware support. +.PP +The Unixware system has outstanding mmap reread rates, better than +systems of substantially higher cost. Linux needs to do some work on +the \f(CWmmap\fP code. 
+.NH 1 +Latency measurements +.PP +Latency is an often-overlooked +area of performance problems, possibly because resolving latency issues +is frequently much harder than resolving bandwidth issues. For example, +memory bandwidth may be increased by making wider cache lines and increasing +memory ``width'' and interleave, +but memory latency can be improved only by shortening paths or increasing +(successful) prefetching. +The first step toward improving latency is understanding the +current latencies in a system. +.PP +The latency measurements included in this suite are +memory latency, +basic operating system entry cost, +signal handling cost, +process creation times, +context switching, +interprocess communication, +.\" virtual memory system latency, +file system latency, +and disk latency. +.NH 2 +Memory read latency background +.PP +In this section, we expend considerable effort to define the different memory +latencies and to explain and justify our benchmark. +The background is a bit tedious but important, since we believe the +memory +latency measurements to be one of the most thought-provoking and useful +measurements in \*[lmbench]. +.PP +The most basic latency measurement is memory latency since most of +the other latency measurements can be expressed in terms of memory +latency. For example, context switches require saving the current +process state and loading the state of the next process. However, memory +latency is rarely accurately measured and frequently misunderstood. +.PP +Memory read latency has many definitions; +the most common, +in increasing time order, +are memory chip cycle time, processor-pins-to-memory-and-back time, +load-in-a-vacuum time, and back-to-back-load time. +.BU "Memory chip cycle latency" : +Memory chips are rated in nanoseconds; typical speeds are around 60ns. +A general overview on DRAM architecture may be found in +.RN Hennessy96 . 
+The +specific information we describe here is from +.RN Toshiba94 +and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0 +\s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the +time from +.ps -1 +.nr width \w'R\&A\&S' +.nr height \n[rst]+1000 +RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +assertion to the when +the data will be available on the \s-1DRAM\s0 pins (assuming +.ps -1 +.nr width \w'C\&A\&S' +.nr height \n[rst]+1000 +CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +access time requirements were met). +While it is possible +to get data out of a \s-1DRAM\s0 in 60ns, that is not all of +the time involved. There is a precharge time that must occur after +every access. +.RN Toshiba94 +quotes 110ns as the random read or write cycle time and this +time is more representative of the cycle time. +.\" For example, most systems offer a wide range of memory +.\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number +.\" of simms range from 1 to 16. The more simms there are, the more +.\" capacitance there is in the memory subsystem. More capacitance means +.\" longer setup times for the fully populated memory subsystem. System +.\" designers have to allow time for this setup. +.\" For more details, consult [XXX - reference on DRAM]. +.\" This is sometimes referred to as the chip latency. The +.\" chip cycle time is the chip latency plus the time required to restore +.\" the data in the capacitors which is often referred to as the precharge +.\" time. This means that 60 nanosecond memory chips really are more like +.\" 100 nanosecond memory chips. Some systems operate memory in ``page +.\" mode'' or ``static column'' memory systems hold either RAS or CAS and +.\" allow subsequent accesses in the same row or column in one cycle instead +.\" of two. 
+.BU "Pin-to-pin latency" :
+This number represents the time needed
+for the memory request to travel from the processor's pins to the memory
+subsystem and back again. Many vendors have used the pin-to-pin
+definition of memory latency in their reports. For example,
+.RN Fenwick95
+while describing the \s-1DEC\s0 8400
+quotes memory latencies of 265ns; a careful
+reading of that paper shows that these are pin-to-pin numbers. In spite
+of the historical precedent in vendor reports, this definition of memory
+latency is misleading since it ignores actual delays seen when a load
+instruction is immediately followed by a use of the data being loaded.
+The number of additional cycles inside the processor can be significant
+and grows more significant with today's highly pipelined architectures.
+.PP
+It is worth noting that the pin-to-pin numbers
+include the amount of time it takes to charge
+the lines going to the \s-1SIMM\s0s, a time that increases with the
+(potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
+more capacitance which results in longer charge times. This is one reason
+why personal computers frequently have better memory latencies than
+workstations: the PCs typically have less memory capacity.
+.BU "Load-in-a-vacuum latency" :
+A load in a vacuum is the time that the processor will wait for one load that
+must be fetched from main memory (i.e., a cache miss). The ``vacuum''
+means that there is no other activity on the system bus, including no other
+loads.
+While this number is frequently used as the memory latency, it is not very
+useful. It is basically a ``not to exceed'' number important only for
+marketing reasons.
+Some architects point out that since most processors implement nonblocking
+loads (the load does not cause a stall until the data is used), the perceived
+load latency may be much less than the real latency. 
When pressed, however, +most will admit that cache misses occur in bursts, resulting in perceived +latencies of at least the load-in-a-vacuum latency. +.BU "Back-to-back-load latency" : +Back-to-back-load latency is the time that each load takes, assuming +that the instructions before and after are also cache-missing loads. +Back-to-back loads may take longer than loads in a vacuum for the +following reason: many systems implement something known as \fIcritical +word first\fP, which means that the subblock of the cache line that +contains the word being loaded is delivered to the processor before the +entire cache line has been brought into the cache. If another load +occurs quickly enough after the processor gets restarted from the +current load, the second load may stall because the cache is still +busy filling the cache line for the previous load. On some systems, +such as the current implementation of UltraSPARC, +the difference between back to back and load in a vacuum is about 35%. +.PP +\*[lmbench] measures back-to-back-load latency because it is the +only measurement that may be easily measured from software and +because we feel that it is what most software developers consider to be memory +latency. Consider the following C code fragment: +.DS +.nf +.ft CW +p = head; +while (p->p_next) + p = p->p_next; +.ft +.fi +.DE +On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the +load. A 300 Mhz processor has a 3.33ns cycle time, so the loop +could execute in slightly less than 10ns. However, the load itself +takes 400ns on a 300 Mhz \s-1DEC\s0 8400. In other words, the +instructions cost 10ns but the load stalls for 400. Another +way to look at it is that 400/3.3, or 121, nondependent, +nonloading instructions following the load would be needed +to hide the load latency. 
+Because superscalar processors typically execute multiple operations +per clock cycle, they need even more useful operations between cache +misses to keep the processor from stalling. +.PP +This benchmark illuminates the tradeoffs in processor cache design. +Architects like large cache lines, up to 64 bytes or so, because +the prefetch effect of gathering a whole line increases +hit rate given reasonable spatial locality. +Small stride sizes have high spatial locality and should have higher +performance, but large stride sizes have poor spatial locality causing +the system to prefetch useless data. +So the benchmark provides the following insight into negative +effects of large line prefetch: +.BU +Multi-cycle fill operations are typically atomic events at the +caches, and sometimes block other cache accesses until they +complete. +.BU +Caches are typically single-ported. Having a large line prefetch +of unused data causes extra bandwidth +demands at the cache, and can cause increased access latency for +normal cache accesses. +.PP +In summary, we believe that processors are so fast that the average +load latency for cache misses will be closer to the +back-to-back-load number than to the load-in-a-vacuum number. We are +hopeful that the industry will standardize on this definition of +memory latency. +.NH 2 +Memory read latency +.PP +The entire memory hierarchy can be measured, including on-board data +cache latency and size, external data cache latency and size, and +main memory latency. +Instruction caches are not measured. +TLB miss latency can also be measured, as in +.RN Saavedra92 , +but we stopped at main memory. Measuring TLB miss time is problematic +because different systems map different amounts of memory with their +TLB hardware. +.PP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. 
Then the list is walked thus: +.DS +.ft CW +mov r4,(r4) # C code: p = *p; +.ft +.DE +The time to do about 1,000,000 loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction in one processor cycle +(not counting stalls). In other words, if the processor cache load time +is 60ns on a 20ns processor, the load latency reported +would be 40ns, the additional 20ns is for the load instruction +itself.\** +.FS +In retrospect, this was a bad idea because we calculate the clock +rate to get the instruction execution time. If the clock rate is off, +so is the load time. +.FE +Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (we don't know of any processors that do that). +.PP +This benchmark has been validated by logic analyzer measurements +on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer +Research Center. +.TSTART 1 +.so mem.pic +.FEND "Memory latency" 1 +.PP +Results from the memory latency benchmark are plotted as a series of data sets +as shown in Figure \n[FIGURE]. +Each data set represents a stride size, +with the array size varying from 512 bytes up to 8M or more. +The curves contain a series of +horizontal plateaus, where each plateau represents a level in the +memory hierarchy. +The point where each plateau ends and the line rises marks the +end of that portion of the memory hierarchy (e.g., external cache). +Most machines have similar memory hierarchies: +on-board cache, external cache, main memory, and main memory plus TLB +miss costs. 
+There are variations: some processors are missing a cache, while +others add another cache to the hierarchy. +.\" XXX Larry please double-check this; I am going on dim memory... +For example, the Alpha 8400 has two on-board caches, one 8K +and the other 96K. +.PP +The cache line size can be derived by comparing curves and noticing which +strides are faster than main memory times. The smallest stride that is +the same as main memory speed is likely to be the cache line size because +the strides that are faster than memory are +getting more than one hit per cache line. +.\" Prefetching may confuse +.\" the issue because a demand read may stall behind a prefetch load, +.\" causing cache lines to appear twice as large as they are. +.\" XXX +.\" Larry --- can we use prime modulus arithmetic to set up pointer +.\" loops which might appear random but which really aren't and which +.\" hit every stride once before looping? +.\" +.\" XXX +.\" Larry --- is there any way we can defeat/disable prefetching +.\" so the cache line size can be more accurately determined? +.\" +.\" XXX +.\" Larry --- can we create a benchmark for TLB misses? +.\" I think it was Tom Rokicki who suggested that we create a +.\" benchmark where the data fits in the cache, but the pages don't +.\" fit in the TLB. +.\" +.\" XXX +.\" Larry --- is the description of the memory hierarchy correct? +.\" I am not sure I haven't added an extra level of external cache... +.EQ +delim $$ +.EN +.PP +Figure \n[FIGURE] shows memory latencies on a nicely made machine, +a \s-1DEC\s0 Alpha. +We use this machine as the example +because it shows the latencies and sizes of +the on-chip level 1 and motherboard level 2 caches, and because it +has good all-around numbers, especially considering it can support a +4M level 2 cache. +The on-board cache is $2 sup 13$ bytes or 8K, while the +external cache is $2 sup 19$ bytes or 512K. 
+.EQ
+delim off
+.EN
+.TSTART
+.so lat_allmem.tbl
+.TEND "Cache and memory latency (ns)"
+.nr MEMTABLE \n[TABLE]
+.PP
+Table \n[TABLE] shows the cache size, cache latency, and main memory
+latency as extracted from the memory latency graphs.
+The graphs and the tools for extracting the data are
+included with \*[lmbench].
+It is worthwhile to plot all of the graphs and examine them since the
+table is missing some details, such as the
+\s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache.
+.PP
+We sorted Table \n[TABLE] on level 2 cache latency because we think
+that many applications will fit in the level 2 cache. The HP and IBM
+systems have only one level of cache so we count that as both level 1
+and level 2. Those two systems have remarkable cache performance for
+caches of that size. In both cases, the cache delivers data in one
+clock cycle after the load instruction.
+.PP
+HP systems usually focus on
+large caches as close as possible to the processor. An older HP
+multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
+cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
+primarily a database server.
+.FS
+The Usenix version of this paper had this as a set associative cache; that was
+incorrect.
+.FE
+.PP
+The IBM focus is on low latency, high
+bandwidth memory. The IBM memory subsystem is good because all of
+memory is close to the processor, but has the weakness that it is
+extremely difficult to evolve the design to a multiprocessor system.
+.PP
+The 586 and PowerPC motherboards have quite poor second level caches,
+the caches are not substantially better than main memory.
+.PP
+The Pentium Pro and Sun Ultra second level caches are of medium speed
+at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared
+against the HP and IBM one cycle latency caches of similar size.
+Given the tight integration of the Pentium Pro level 2 cache, it is
+surprising that it has such high latencies. 
+.PP +The 300Mhz DEC Alpha has a rather high 22 clock latency to the second +level cache which is probably one of the reasons that they needed a 96K +level 1.5 cache. SGI and DEC have used large second level caches +to hide their long latency from main memory. +.PP +.NH 2 +Operating system entry +.PP +Entry into the operating system is required for many system facilities. +When calculating the cost of a facility, it is useful to know how +expensive it is to perform a nontrivial entry into the operating system. +.PP +We measure nontrivial entry into the system by repeatedly writing one +word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but +discard the data. This particular entry point was chosen because it has +never been optimized in any system that we have measured. Other entry +points, typically \*[getpid] and \*[gettimeofday], are heavily used, +heavily optimized, and sometimes implemented as user-level library +routines rather than system calls. +A write to the \f(CW/dev/null\fP driver will go +through the system call table to \*[write], verify the user area as +readable, look up the file descriptor to get the vnode, call the vnode's +write function, and then return. +.TSTART +.so ../Results/tmp/lat_nullsys.tbl +.TEND "Simple system call time (microseconds)" +.PP +Linux is the clear winner in the system call time. The reasons are +twofold: Linux is a uniprocessor operating system, without any +MP overhead, and Linux is a small operating system, without all +of the ``features'' accumulated by the commercial offers. +.PP +Unixware and Solaris are doing quite well, given that they are both fairly +large, commercially oriented operating systems with a large accumulation +of ``features.'' +.NH 2 +Signal handling cost +.PP +Signals in Unix are a way to tell another process to handle an event. They +are to processes as interrupts are to the CPU. +.PP +Signal handling is often critical to layered systems. 
Some applications,
+such as databases, software development environments, and threading libraries
+provide an operating system-like layer on top of the operating system,
+making signal handling a critical path in many of these applications.
+.PP
+\*[lmbench] measures both signal installation and signal dispatching in two separate
+loops, within the context of one process.
+It measures signal handling by installing a signal handler and then repeatedly
+sending itself the signal.
+.TSTART
+.so ../Results/tmp/lat_signal.tbl
+.TEND "Signal times (microseconds)"
+.PP
+Table \n[TABLE] shows the signal handling costs.
+Note that there are no context switches in this benchmark; the signal goes
+to the same process that generated the signal. In real applications,
+the signals usually go to another process, which implies
+that the true cost of sending that signal is the signal overhead plus the
+context switch overhead. We wanted to measure signal and context
+switch overheads separately since context
+switch times vary widely among operating systems.
+.PP
+SGI does very well on signal processing,
+especially since their hardware is of an older generation than
+many of the others.
+.PP
+The Linux/Alpha signal handling numbers are so poor
+that we suspect that this is a bug, especially given that the Linux/x86
+numbers are quite reasonable.
+.NH 2
+Process creation costs
+.PP
+Process benchmarks are used to measure the basic process primitives,
+such as creating a new process, running a different program, and context
+switching. Process creation benchmarks are of particular interest
+in distributed systems since many remote operations include the creation
+of a remote process to shepherd the remote operation to completion.
+Context switching is important for the same reasons.
+.BU "Simple process creation" .
+The Unix process creation primitive is \*[fork], which
+creates a (virtually) exact copy of the calling process. 
+Unlike VMS and some other operating systems, Unix starts any new process
+with a \*[fork].
+Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and
+``light,'' facts that many have been ignoring for some time.
+.PP
+\*[lmbench] measures simple process creation by creating a process
+and immediately
+exiting the child process. The parent process waits for the child
+process to exit.
+The benchmark is intended to measure the overhead for creating a
+new thread of control, so it includes the \*[fork] and
+the \*[exit] time.
+.PP
+The benchmark also includes a \f(CWwait\fP system call in the parent and
+context switches from the parent to the child and back again. Given that
+context switches of this sort are on the order of 20 microseconds and a
+system call is on the order of 5 microseconds, and that the entire benchmark
+time is on the order of a millisecond or more, the extra overhead
+is insignificant.
+Note that even this relatively simple task is very expensive and is
+measured in milliseconds while most of the other operations we consider are
+measured in microseconds.
+.BU "New process creation" .
+The preceding benchmark did not create a new application; it created a
+copy of the old application. This benchmark measures the cost of creating a
+new process and changing that process into a new application, which
+forms the basis of every Unix command
+line interface, or shell.
+\*[lmbench] measures this facility by forking a new child and having that child
+execute a new program \(em in this case, a tiny program that prints
+``hello world'' and exits.
+.PP
+The startup cost is especially noticeable
+on (some) systems that have shared libraries. Shared libraries can
+introduce a substantial (tens of milliseconds) startup cost.
+.\" XXX - statically linked example?
+.TSTART
+.so ../Results/tmp/lat_allproc.tbl
+.TEND "Process creation time (milliseconds)"
+.BU "Complicated new process creation" . 
+When programs start other programs, they frequently use one of +three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first +two interfaces start a new process by invoking the standard command +interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way +guarantees that the shell will look for the requested application +in all of the places that the user would look \(em in other words, the shell +uses the user's $PATH variable as a list of places to find the +application. \*[execlp] is a C library routine which also looks for the +program using the user's $PATH variable. +.PP +Since this is a common way of starting applications, we felt it +was useful to show the costs of the generality. +.PP +We measure this by starting \f(CW/bin/sh\fP to start the same tiny +program we ran in the last case. +In Table \n[TABLE] the cost of asking the shell to go +look for the program is +quite large, frequently ten times as expensive as just creating a +new process, and four times as expensive as explicitly naming the location +of the new program. +.PP +The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. +Given that the processor is one of the fastest, the problem is likely to be +software. There is room for substantial improvement in the Solaris +process creation code. +.NH 2 +Context switching +.PP +Context switch time is defined here as +the time needed to save the state of one process and restore the state +of another process. +.PP +Context switches are frequently in the critical performance path of +distributed applications. For example, the multiprocessor versions +of the IRIX operating system use +processes to move data through the networking stack. This means that the +processing time for each new packet arriving at an idle system includes +the time needed to switch in the networking process. 
+.PP +Typical context switch benchmarks measure just the minimal context switch +time \(em the time to switch between two processes that are doing nothing +but context switching. We feel that this is +misleading because there are frequently more than two active processes, +and they usually have a larger working set (cache footprint) +than the benchmark processes. +.PP +Other benchmarks frequently include the cost of +the system calls needed to force the context switches. +For example, Ousterhout's context switch benchmark +measures context switch time plus a \*[read] and a \*[write] +on a pipe. +In many of the systems measured by \*[lmbench], the pipe overhead +varies between 30% and 300% of the context switch time, so we were +careful to factor out the pipe overhead. +.BU "Number of processes." +The context switch benchmark is implemented as +a ring of two to twenty processes that are connected with Unix pipes. +A token is passed from process to process, forcing context switches. +The benchmark measures the time needed to pass +the token two thousand times from process to process. +Each transfer of the token has two costs: the context switch, and +the overhead of passing the token. +In order to calculate just the context switching time, the benchmark first +measures the cost of passing the token through a ring of pipes in a +single process. This overhead time is defined as the cost of passing +the token and is not included in the reported context switch time. +.BU "Size of processes." +In order to measure more realistic context switch times, we add +an artificial variable size ``cache footprint'' to the switching +processes. The cost of the context switch then includes the cost +of restoring user-level state (cache footprint). The cache footprint +is implemented by having the process allocate an array of data\** +.FS +All arrays are at the same virtual +address in all processes. 
+.FE +and sum +the array as a series of integers after receiving the token but before +passing the token to the next process. Since most systems will cache data +across context switches, the working set for the benchmark is slightly +larger than the number of processes times the array size. +.PP +It is worthwhile to point out that the overhead mentioned above +also includes the cost of accessing the data, in the same way as +the actual benchmark. However, because the overhead is measured +in a single process, the cost is typically the cost with ``hot'' +caches. In the Figure 2, each size is plotted as a line, with +context switch times on the Y axis, number of processes on the +X axis, and the process size as the data set. +The process size and the hot cache overhead costs for +the pipe read/writes and any data access is what is labeled +as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead +is in microseconds. +.PP +The context switch time does not include anything other than +the context switch, provided that all the benchmark processes fit in the +cache. If the total size of all of the benchmark processes is larger +than the cache size, the cost of each context switch will include cache +misses. +We are trying to show realistic context switch times as a +function of both size and number of processes. +.TSTART 1 +.so ctx.pic +.FEND "Context switch times" 1 +.PP +Results for an Intel Pentium Pro system running Linux at 167 MHz are +shown in Figure \n[FIGURE]. +The data points on the figure are labeled with the working set +due to the sum of data in all of the processes. The actual working set is +larger, as it includes the process and kernel overhead as well. +One would expect the context switch times to stay constant until +the working set is +approximately the size of the second level cache. The Intel system has a +256K second level cache, and the context switch times +stay almost constant until about 256K (marked as .25M in the graph). 
+.BU "Cache issues" +The context switch benchmark is a deliberate measurement of the +effectiveness of the caches across process context switches. If the +cache does not include the process identifier (PID, also sometimes +called an address space identifier) as part of the address, then the +cache must be flushed on every context switch. If the cache does not map +the same virtual addresses from different processes to different cache +lines, then the cache will appear to be flushed on every context +switch. +.PP +If the caches do +not cache across context switches there would be no grouping at the +lower left corner of Figure \n[FIGURE], instead, the graph would +appear as a series of straight, horizontal, parallel lines. The number +of processes will not matter, the two process case will be just as bad +as the twenty process case since the cache would not be +useful across context switches. +.TSTART +.so ../Results/tmp/ctx.tbl +.TEND "Context switch time (microseconds)" +.PP +We picked four points on the graph and extracted those values for Table +\n[TABLE]. The complete set of values, as well as tools to graph them, +are included with \*[lmbench]. +.PP +Note that multiprocessor context switch times are frequently more expensive +than uniprocessor context switch times. This is because multiprocessor +operating systems tend to have very complicated scheduling code. +We believe that multiprocessor context switch times can be, and should be, +within 10% of the uniprocessor times. +.PP +Linux does quite well on context switching, especially on the more +recent architectures. By comparing the Linux 2 0K processes to the +Linux 2 32K processes, it is apparent that there is something wrong +with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can +find at least part of the cause. The second level cache latency for the +i586 is substantially worse than either the i686 or the Alpha. 
+.PP +Given the poor second level cache behavior of the PowerPC, it is surprising +that it does so well on context switches, especially the larger sized cases. +.PP +The Sun Ultra1 context switches quite well in part because of enhancements +to the register window handling in SPARC V9. +.NH 2 +Interprocess communication latencies +.PP +Interprocess communication latency is important because many operations +are control messages to another process (frequently on another +system). The time to tell the remote process to +do something is pure overhead and is frequently in the critical path +of important functions such as distributed applications (e.g., +databases, network servers). +.PP +The interprocess communication latency benchmarks typically have the +following form: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds needed +to do one round trip. For one way timing, +about half the round trip is right. However, the CPU cycles tend to be +somewhat asymmetric for one trip: receiving is typically more +expensive than sending. +.BU "Pipe latency" . +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +.PP +Pipes are frequently used as a local IPC mechanism. Because of the +simplicity of pipes, they are frequently the fastest portable +communication mechanism. +.PP +Pipe latency is measured by creating a pair of pipes, forking a child process, +and passing a word back and forth. This benchmark is identical to the +two-process, zero-sized context switch benchmark, except that it includes +both the context switching time and the pipe overhead in the results. +.nr NTABLE \n[TABLE]+1 +.nr LTABLE \n[TABLE] +Table \n[NTABLE] shows the round trip latency from process A to process B +and back to process A. 
+.TSTART +.so ../Results/tmp/lat_pipe.tbl +.TEND "Pipe latency (microseconds)" +.PP +The time can be broken down to two context switches plus four system calls +plus the pipe overhead. The context switch component is two of the small +processes in Table \n[LTABLE]. +This benchmark is identical to the context switch benchmark in +.RN Ousterhout90 . +.BU "TCP and RPC/TCP latency" . +TCP sockets may be viewed as an interprocess communication mechanism similar +to pipes with the added feature that TCP sockets work across machine +boundaries. +.PP +TCP and RPC/TCP connections are frequently used in low-bandwidth, +latency-sensitive applications. The default Oracle distributed +lock manager uses TCP sockets, and the locks per second available +from this service are accurately modeled by the TCP latency test. +.TSTART +.so ../Results/tmp/lat_tcp.tbl +.TEND "TCP latency (microseconds)" +.PP +Sun's RPC is layered either over TCP or over UDP. +The RPC layer is responsible for managing connections (the port mapper), +managing different byte orders and word sizes (XDR), and implementing a +remote procedure call abstraction. +Table \n[TABLE] shows the same benchmark with and +without the RPC layer to show the cost of the RPC implementation. +.PP +TCP latency is measured by having a server process that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is one +round-trip time. The measurements in Table \n[TABLE] are local +or loopback measurements, +since our intent is to show the overhead of the software. The same benchmark +may be, and frequently is, used to measure host-to-host latency. +.PP +Note that the RPC layer frequently adds hundreds of microseconds of +additional latency. The problem is not the external data +representation (XDR) layer \(em the +data being passed back and forth is a byte, so there is no XDR to be done. 
+There is no justification for the extra cost; it is simply +an expensive implementation. DCE RPC is worse. +.TSTART +.so ../Results/tmp/lat_udp.tbl +.TEND "UDP latency (microseconds)" +.BU "UDP and RPC/UDP latency" . +UDP sockets are an alternative to TCP sockets. They differ in that UDP +sockets are unreliable messages that leave the retransmission issues to +the application. UDP sockets have a few advantages, however. They preserve +message boundaries, whereas TCP does not; and a single UDP socket may +send messages +to any number of other sockets, whereas TCP sends data to only one place. +.PP +UDP and RPC/UDP messages are commonly used in many client/server applications. +NFS is probably the most widely used RPC/UDP application in the world. +.PP +Like TCP latency, UDP latency is measured by having a server process +that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is round-trip +time. The measurements in Table \n[TABLE] are local or loopback measurements, +since our intent is to show the overhead of the software. +Again, note that the RPC library can add hundreds of microseconds of extra +latency. +.\" .PP +.\" It is interesting to compare UDP latency with TCP latency. In many cases the +.\" TCP latency is \fBless\fP than the UDP latency. This flies in the face +.\" of conventional wisdom, which says that TCP is an inherently more expensive +.\" protocol than UDP. The reasons that TCP may appear faster are: in this +.\" benchmark, the protocol costs are dwarfed by the other costs (context +.\" switching, system calls, and driver overhead); and TCP is frequently +.\" hand-tuned for performance, while UDP is rarely hand-tuned. +.TSTART +.so ipc.tbl +.TEND "Remote latencies (microseconds)" +.BU "Network latency" . +We have a few results for over the wire latency included in Table \n[TABLE]. 
+As might be expected, the most heavily used network interfaces (i.e., ethernet) +have the lowest latencies. The times shown include the time on the wire, +which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit +ethernet and FDDI, and less than 10 microseconds for Hippi. +.BU "TCP connection latency" . +TCP is a connection-based, reliable, byte-stream-oriented protocol. As +part of this reliability, a connection must be established before any +data can be transferred. The connection is accomplished by a ``three-way +handshake,'' an exchange of packets when the client attempts to connect +to the server. +.PP +Unlike UDP, where no connection is established, TCP sends packets +at startup time. If an application creates a TCP connection to send +one message, then the startup time can be a substantial +fraction of the total connection and transfer costs. +The benchmark shows that the connection cost is approximately half of +the cost. +.PP +Connection cost is measured by having a server, registered using +the port mapper, waiting for connections. The client figures out where the +server is registered and then repeatedly times a \*[connect] system call to +the server. The socket is closed after each connect. Twenty connects +are completed and the fastest of them is used as the result. The time measured +will include two of the three packets that make up the three way TCP handshake, +so the cost is actually greater than the times listed. +.\" XXX Larry --- if a machine's clock granularity is on the order of +.\" 10 milliseconds, won't this benchmark run into granularity problems? 
+.TSTART +.so ../Results/tmp/lat_connect.tbl +.TEND "TCP connect latency (microseconds)" +.PP +Table \n[TABLE] shows that if the need is to send +a quick message to another process, given that most packets get through, +a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive +acknowledgments are needed, which they are in order to have an apples-to-apples +comparison with TCP). If the transmission medium is 10Mbit Ethernet, the +time on the wire will be approximately 65 microseconds each way, or 130 +microseconds total. To do the same thing with a short-lived TCP +connection would cost 896 microseconds of wire time alone. +.PP +The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor +is the point to suggest that all messages should be UDP. In many cases, +the difference between 130 microseconds and 900 microseconds is +insignificant compared with other aspects of application performance. +However, if the application is very latency sensitive +and the transmission medium is slow (such as serial link or a message +through many routers), then a UDP message may prove cheaper. +.NH 2 +File system latency +.PP +File system latency is defined as the time required to create or delete +a zero length file. +We define it this way because in many file systems, +such as the BSD fast file system, the directory operations are done +synchronously in order to maintain on-disk integrity. Since the +file data is typically cached and sent to disk at some later date, +the file creation and deletion become the bottleneck +seen by an application. This bottleneck is substantial: to do +a synchronous update to a disk is a matter of tens of milliseconds. +In many cases, this bottleneck is much more of a perceived performance +issue than processor speed. +.PP +The benchmark creates 1,000 zero-sized files and then deletes them. +All the files are created in one directory and their names are +short, such as "a", "b", "c", ... "aa", "ab", .... 
+.TSTART +.so lat_fs.tbl +.TEND "File system latency (microseconds)" +.PP +The create and delete latencies are shown in Table \n[TABLE]. +Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster +than the slowest systems. However, Linux does not guarantee +anything about the disk integrity; the directory operations are done in +memory. Other fast systems, such as SGI's XFS, use a log to guarantee the +file system integrity. +The slower systems, all those with ~10 millisecond file latencies, are +using synchronous writes to guarantee the file system integrity. +Unless Unixware has modified UFS substantially, they must be running in +an unsafe mode since the FreeBSD UFS is much slower and both file +systems are basically the 4BSD fast file system. +.NH 2 +Disk latency +.\" XXX - either get more results for this benchmark or delete it. +.\" I'd really like to not delete it - lmdd is probably the most +.\" useful tool and it gets the least press. +.PP +Included with \*[lmbench] is a small benchmarking program useful for +measuring disk and file I/O. \*[lmdd], which is patterned after +the Unix utility \f(CWdd\fP, measures both sequential and random I/O, +optionally generates patterns on output and checks them on input, +supports flushing the data from the buffer cache on systems that +support \f(CWmsync\fP, and has a very flexible user interface. +Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script +wrapped around \*[lmdd]. +.PP +While we could have generated both sequential and random I/O results as +part of this paper, we did not because those benchmarks are heavily +influenced by the performance of the disk drives used in the test. We +intentionally measure only the system overhead of a SCSI command since +that overhead may become a bottleneck in large database configurations. +.PP +Some important applications, such as transaction processing, are +limited by random disk IO latency. 
+Administrators can increase the number of disk operations per +second by buying more disks, until the processor overhead becomes +the bottleneck. +The \f(CWlmdd\fP benchmark measures the processor overhead associated with each +disk operation, and it can provide an upper bound on the number of +disk operations the processor can support. +It is designed for SCSI disks, and it assumes that most +disks have 32-128K read-ahead buffers and that they can read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true: a processor could be fast enough to make the +requests faster than the rotating disk. +If we take 6M/second to be disk +speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or +81 microseconds/IO. We don't know of any processor/OS/IO controller +combinations that can do an IO in 81 microseconds. +.FE +.PP +The benchmark simulates a large number of disks by reading 512byte +transfers sequentially from the raw disk device (raw disks are unbuffered +and are not read ahead by Unix). +Since the disk can read ahead faster than the system can request +data, the benchmark is doing small transfers of data from the +disk's track buffer. +Another way to look at this is that the benchmark +is doing memory-to-memory transfers across a SCSI channel. +It is possible to generate loads of more than 1,000 SCSI +operations/second on a single SCSI disk. For comparison, disks under +database load typically run at 20-80 operations per second. +.TSTART +.so ../Results/tmp/lat_disk.tbl +.TEND "SCSI I/O overhead (microseconds)" +.PP +The resulting overhead number represents a +\fBlower\fP bound on the overhead of a disk I/O. +The real overhead numbers will be higher on SCSI systems because +most SCSI controllers will not disconnect if the request can be +satisfied immediately. 
+During the benchmark, the processor simply sends the request and +transfers the data, while +during normal operation, the processor will send the request, +disconnect, get interrupted, reconnect, and transfer the data. +.PP +This technique can be used to discover how many drives a system can support +before the system becomes CPU-limited because it can produce the +overhead load of a fully configured system with just a few disks. +.NH 1 +Future work +.PP +There are several known improvements and extensions that could be made +to \*[lmbench]. +.BU "Memory latency" . +The current benchmark measures clean-read latency. By clean, we mean that +the cache lines being replaced are highly likely to be unmodified, so there +is no associated write-back cost. We would like to extend the benchmark +to measure dirty-read latency, as well as write latency. Other changes +include making the benchmark impervious to sequential prefetching and +measuring TLB miss cost. +.BU "MP benchmarks" . +None of the benchmarks in \*[lmbench] is designed to measure any +multiprocessor features directly. At a minimum, we could measure +cache-to-cache latency as well as cache-to-cache bandwidth. +.BU "Static vs. dynamic processes" . +In the process creation section, we allude to the cost of starting up processes +that use shared libraries. When we figure out how to create statically linked +processes on all or most systems, we could quantify these costs exactly. +.BU "McCalpin's stream benchmark" . +We will probably incorporate part or all of this benchmark into \*[lmbench]. +.BU "Automatic sizing" . +We have enough technology that we could determine the size of the external +cache and autosize the memory used such that the external cache had no effect. +.BU "More detailed papers" . +There are several areas that could yield some interesting papers. 
The
+memory latency section could use an in-depth treatment, and the
+context switching section could turn into an interesting discussion of
+caching technology.
+.NH 1
+Conclusion
+.PP
+\*[lmbench] is a useful, portable micro-benchmark suite designed to
+measure important aspects of system performance. We have found that a good
+memory subsystem is at least as important as the processor speed.
+As processors get faster and faster, more and more of the system design
+effort will need to move to the cache and memory subsystems.
+.NH 1
+Acknowledgments
+.PP
+Many people have provided invaluable help and insight into both the
+benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers
+were especially helpful.
+We thank all of them
+and especially thank:
+Ken Okin \s-1(SUN)\s0,
+Kevin Normoyle \s-1(SUN)\s0,
+Satya Nishtala \s-1(SUN)\s0,
+Greg Chesson \s-1(SGI)\s0,
+John Mashey \s-1(SGI)\s0,
+Neal Nuckolls \s-1(SGI)\s0,
+John McCalpin \s-1(Univ. of Delaware)\s0,
+Ron Minnich \s-1(Sarnoff)\s0,
+Chris Ruemmler \s-1(HP)\s0,
+Tom Rokicki \s-1(HP)\s0,
+and
+John Weitz \s-1(Digidesign)\s0.
+.PP
+We would also like to thank all of the people that have run the
+benchmark and contributed their results; none of this would have been possible
+without their assistance.
+.PP
+Our thanks to
+all of the free software community for tools that were used during this
+project.
+\*[lmbench] is currently developed on Linux, a copylefted Unix written by
+Linus Torvalds and his band of happy hackers.
+This paper and all of the
+\*[lmbench] documentation was produced using
+the \f(CWgroff\fP suite of tools written by James Clark.
+Finally, all of the data processing of the results is done with
+\f(CWperl\fP written by Larry Wall.
+.PP
+Sun Microsystems, and in particular Paul Borrill,
+supported the initial development of this project. Silicon Graphics
+has supported ongoing development that turned into far more time than we
+ever imagined. 
We are grateful to both of these companies for their +financial support. +.NH 1 +Obtaining the benchmarks +.PP +The benchmarks are available at +.ft I +http://reality.sgi.com/employees/lm_engr/lmbench.tgz +.ft +as well as via a mail server. +You may request the latest version of \*[lmbench] by sending email +to \fIarchives@xxxxxxxxxxxxxxxxxxx\fP with \fIlmbench-current*\fP +as the subject. +.\" .R1 +.\" bibliography references +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. +.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. 
+.R1 +accumulate +sort A+DT +database references +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references +.R2 +.so bios diff --git a/performance/lmbench3/doc/userguide.ms b/performance/lmbench3/doc/userguide.ms new file mode 100755 index 0000000..9bf3f4f --- /dev/null +++ b/performance/lmbench3/doc/userguide.ms @@ -0,0 +1,3782 @@ +.\" This document is GNU groff -mgs -t -p -R -s +.\" It will not print with normal troffs, it uses groff features, in particular, +.\" long names for registers & strings. +.\" Deal with it and use groff - it makes things portable. +.\" +.\" $X$ xroff -mgs -t -p -R -s $file +.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more +.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr +.VARPS +.\" Define a page top that looks cool +.\" HELLO CARL! To turn this off, s/PT/oldPT/ +.de PT +.tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' +.. +.de lmPT +.if \\n%>1 \{\ +. sp -.1i +. ps 14 +. ft 3 +. nr big 24 +. nr space \\w'XXX' +. nr titlewid \\w'\\*[title]' +. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 +. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' +. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 +. ce 1 +\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] +. ps +. sp -.70 +. ps 12 +\\l'\\n[LL]u' +. ft +. ps +.\} +.. +.\" Define a page bottom that looks cool +.\" HELLO CARL! To turn this off, s/BT/oldBT/ +.de BT +.tl '\(co 2002 \\*[author]'%'\fB\\*(DY DRAFT DO NOT DISTRIBUTE\fP' +.. +.de lmBT +. ps 9 +\v'-1'\\l'\\n(LLu' +. sp -1 +. tl '\(co 2002 \\*[author]'\\*(DY'%' +. ps +.. +.de SP +. if t .sp .5 +. if n .sp 1 +.. +.de BU +. SP +. ne 2 +\(bu\ +. if \\n[.$] \fB\\$1\fP\\$2 +.. +.nr FIGURE 0 +.nr TABLE 0 +.nr SMALL .25i +.de TSTART +. KF +. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 +. ps -1 +. vs -1 +.. +.de TEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr TABLE \\n[TABLE]+1 +. ce 1 +\fBTable \\n[TABLE].\ \ \\$1\fP +. 
SP +. KE +.. +.de FEND +. ps +1 +. vs +1 +. if \\n[.$]=2 \{\ +. sp -.5 +\s(24\\l'\\n[pg@colw]u'\s0 \} +. sp .25 +. nr FIGURE \\n[FIGURE]+1 +. ce 1 +\fBFigure \\n[FIGURE].\ \ \\$1\fP +. SP +. KE +.. +.\" Configuration +.nr PI 3n +.nr HM 1i +.nr FM 1i +.nr PO 1i +.if t .po 1i +.nr LL 6.5i +.if n .nr PO 0i +.if n .nr LL 7.5i +.nr PS 10 +.nr VS \n(PS+1 +.ds title Measuring scalability +.ds author Carl Staelin +.ds lmbench \f(CWlmbench\fP +.ds lmbench1 \f(CWlmbench1\fP +.ds lmbench2 \f(CWlmbench2\fP +.ds lmbench3 \f(CWlmbench3\fP +.ds bcopy \f(CWbcopy\fP +.ds benchmp \f(CWbenchmp\fP +.ds bw_file_rd \f(CWbw_file_rd\fP +.ds bw_mem \f(CWbw_mem\fP +.ds bw_mmap_rd \f(CWbw_mmap_rd\fP +.ds bw_pipe \f(CWbw_pipe\fP +.ds bw_tcp \f(CWbw_tcp\fP +.ds bw_udp \f(CWbw_udp\fP +.ds bw_unix \f(CWbw_unix\fP +.ds connect \f(CWconnect\fP +.ds execlp \f(CWexeclp\fP +.ds execve \f(CWexecve\fP +.ds exit \f(CWexit\fP +.ds fcntl \f(CWfcntl\fP +.ds fork \f(CWfork\fP +.ds fstat \f(CWfstat\fP +.ds gcc \f(CWgcc\fP +.ds getpid \f(CWgetpid\fP +.ds getppid \f(CWgetppid\fP +.ds gettimeofday \f(CWgettimeofday\fP +.ds kill \f(CWkill\fP +.ds lat_connect \f(CWlat_connect\fP +.ds lat_ctx \f(CWlat_ctx\fP +.ds lat_fcntl \f(CWlat_fcntl\fP +.ds lat_fifo \f(CWlat_fifo\fP +.ds lat_fs \f(CWlat_fs\fP +.ds lat_http \f(CWlat_http\fP +.ds lat_mem_rd \f(CWlat_mem_rd\fP +.ds lat_mmap \f(CWlat_mmap\fP +.ds lat_ops \f(CWlat_ops\fP +.ds lat_pagefault \f(CWlat_pagefault\fP +.ds lat_pipe \f(CWlat_pipe\fP +.ds lat_proc \f(CWlat_proc\fP +.ds lat_rpc \f(CWlat_rpc\fP +.ds lat_select \f(CWlat_select\fP +.ds lat_sem \f(CWlat_sem\fP +.ds lat_sig \f(CWlat_sig\fP +.ds lat_syscall \f(CWlat_syscall\fP +.ds lat_tcp \f(CWlat_tcp\fP +.ds lat_udp \f(CWlat_udp\fP +.ds lat_unix \f(CWlat_unix\fP +.ds lat_unix_connect \f(CWlat_unix_connect\fP +.ds line \f(CWline\fP +.ds lmdd \f(CWlmdd\fP +.ds lmdd \f(CWlmdd\fP +.ds memmove \f(CWmemmove\fP +.ds mhz \f(CWmhz\fP +.ds mmap \f(CWmmap\fP +.ds par_mem \f(CWpar_mem\fP +.ds par_ops \f(CWpar_ops\fP +.ds 
pipe \f(CWpipe\fP +.ds popen \f(CWpopen\fP +.ds read \f(CWread\fP +.ds select \f(CWselect\fP +.ds semop \f(CWsemop\fP +.ds sh \f(CW/bin/sh\fP +.ds stat \f(CWstat\fP +.ds stream \f(CWstream\fP +.ds system \f(CWsystem\fP +.ds tlb \f(CWtlb\fP +.ds uiomove \f(CWuiomove\fP +.ds write \f(CWwrite\fP +.ds yield \f(CWyield\fP +.\" References stuff +.de RN \"Reference Name: .RN $1 -- prints the reference prettily +.\" [\s-2\\$1\s+2]\\$2 +[\s-1\\$1\s0]\\$2 +.. +.\" .R1 +.\" sort A+DT +.\" database references +.\" label-in-text +.\" label A.nD.y-2 +.\" bracket-label \*([. \*(.] ", " +.\" .R2 +.EQ +delim $$ +.EN +.TL +\s(14lmbench user guide\s0 +.AU +\s+2\fR\*[author]\fP\s0 +.AI +\fI\s+2Hewlett-Packard Laboratories Israel\s0\fP +.SP +.AB +\*[lmbench] is a micro-benchmark suite designed to focus +attention on the basic building blocks of many +common system applications, such as databases, simulations, +software development, and networking. +It is also designed to make it easy for users to create +additional micro-benchmarks that can measure features, +algorithms, or subsystems of particular interest to the +user. +.SP +There is a timing harness, \*[benchmp], designed +to measure performance at specific levels of parallel +(simultaneous) load. +.AE +.if t .MC 3.05i +.NH 1 +Introduction +.LP +\*[lmbench] is a widely used suite of micro-benchmarks +that measures important aspects of computer system +performance, such as memory latency and bandwidth. +Crucially, the suite is written in portable ANSI-C +using POSIX interfaces and is intended to run on a +wide range of systems without modification. +.LP +The benchmarks included in the suite were chosen +because in the \*[lmbench] developer's experience, +they each represent an aspect of system performance +which has been crucial to an application's +performance. +.LP +In general the benchmarks report either the latency +or bandwidth of an operation or data pathway. 
The +exceptions are generally those benchmarks that +report on a specific aspect of the hardware, such +as the processor clock rate, which is reported +in MHz and nanoseconds. +.LP +\*[lmbench] consists of three major components: +a timing harness, the individual benchmarks +built on top of the timing harness, and the +various scripts and glue that build and run the +benchmarks and process the results. +.NH 2 +\*[lmbench] history +.LP +\*[lmbench1] was written by Larry McVoy +while he was at Sun Microsystems. It focussed +on two measures of system performance: latency +and bandwidth. It measured a number of basic +operating system functions, such as file system +read/write bandwidth or file creation time. It +also focussed a great deal of energy on measuring +data transfer operations, such as \*[bcopy] and +\*[pipe] latency and bandwidth as well as raw +memory latency and bandwidth. +.LP +Shortly after +.RN McVoy96 +was published, +.RN Brown97 +examined the \*[lmbench] benchmarks and published +a detailed critique of its strengths and weaknesses. +Largely in response to these remarks, development +of \*[lmbench2] began with a focus on +improving the experimental design and statistical +data analysis. The primary change was the development +and adoption across all the benchmarks of a timing +harness that incorporated loop-autosizing and clock +resolution detection. In addition, each experiment +was typically repeated eleven times with the median +result reported to the user. +.LP +\*[lmbench3] focussed on extending +\*[lmbench]'s functionality along two dimensions: +measuring multi-processor scalability and measuring +basic aspects of processor architecture. +.LP +There are any number of aspects of a computer's +micro-architecture that can impact a program's +performance, such as the design of the memory +hierarchy and the basic performance of the various +arithmetic units. 
+.LP +All of the new benchmarks were added to \*[lmbench] +because the author needed them to help guide his +design decisions in one or more projects over the +last few years. +For example, \*[lat_ops] was added because the +author was trying to decide whether a particular +image processing algorithm should be implemented +using integer or floating point arithmetic. +Floating point arithmetic was preferred for a +variety of reasons, but it was feared that +floating point arithmetic would be prohibitively +expensive compared to integer operations. +By quickly building \*[lat_ops] the author was +able to verify that the floating point performance +should be no worse than integer performance. +.LP +An important feature of multi-processor systems is their +ability to scale their performance. \*[lmbench1] +was able to measure various important aspects of +system performance, except that only one client process +was active at a time +.RN McVoy96 . +\*[lmbench2] introduced a new macro, BENCH(), which +implemented a sophisticated timing harness that +automatically managed nearly all aspects of accurately +timing operations +.RN Staelin98 . +For example, it automatically +detects the minimal timing interval necessary to +provide timing results within 1% accuracy, and it +automatically repeats most experiments eleven times +and reports the median result. +.LP +However, this timing harness is incapable of measuring +the performance of a system under scalable loads. +\*[lmbench3] took the ideas and techniques +developed in the earlier versions and extended them +to create a new timing harness which can measure +system performance under parallel, scalable loads. +.LP +\*[lmbench3] also includes a version of John +McCalpin's STREAM benchmarks. Essentially the STREAM +kernels were placed in the new \*[lmbench] timing harness. +Since the new timing harness also measures scalability +under parallel load, the \*[lmbench3] STREAM +benchmarks include this capability automatically. 
+.LP +Finally, \*[lmbench3] includes a number of new +benchmarks which measure various aspects of the +processor architecture, such as basic operation +latency and parallelism, to provide developers +with a better understanding of system capabilities. +The hope is that better informed developers will +be able to better design and evaluate performance +critical software in light of their increased +understanding of basic system performance. +.NH 1 +Prior Work +.LP +Benchmarking is not a new field of endeavor. +There are a wide variety of approaches to +benchmarking, many of which differ greatly +from that taken by \*[lmbench]. +.LP +One common form of benchmark is to take an +important application or application and +worklist, and to measure the time required +to complete the entire task. +This approach is particularly useful when +evaluating the utility of systems for a +single and well-known task. +.LP +Other benchmarks, such as SPECint, use a +variation on this approach by measuring +several applications and combining the +results to predict overall performance. +.\" .LP +.\" XXX Byte benchmark +.LP +Another variation takes the "kernel" of +an important application and measures its +performance, where the "kernel" is usually +a simplification of the most expensive +portion of a program. +Dhrystone +.RN Weicker84 +is an example of this type of +benchmark as it measures the performance +of important matrix operations and was often +used to predict system performance for +numerical operations. +.LP +.RN Banga98 +developed a benchmark to measure HTTP server +performance which can accurately measure +server performance under high load. +Due to the idiosyncracies of the HTTP protocol +and TCP design and implementation, there are +generally operating system limits on the rate +at which a single system can generate +independent HTTP requests. +However, +.RN Banga98 +developed a system which can scalably present +load to HTTP servers in spite of this limitation. 
+.LP +John McCalpin's STREAM benchmark measures +memory bandwidth during four common vector +operations +.RN McCalpin95 . +It does not measure memory latency, and +strictly speaking it does not measure raw +memory bandwidth although memory bandwidth +is crucial to STREAM performance. +More recently, work has begun on extending +STREAM to measure scalable memory subsystem +performance, particularly for multi-processor +machines. +.LP +Uros Prestor +.RN Prestor01 +XXX +.LP +Micro-benchmarking extends this "kernel" +approach, by measuring the performance +of operations or resources in isolation. +\*[lmbench] and many other benchmarks, such +as nfsstone +.RN Shein89 , +measure the performance of key operations so +users can predict performance for certain +workloads and applications by combining the +performance of these operations in the right +mixture. +.LP +.RN Saavedra92 +takes the micro-benchmark approach and applies +it to the problem of predicting application +performance. +They analyze applications or other benchmarks +in terms of their ``narrow spectrum benchmarks'' +to create a linear model of the application's +computing requirements. +They then measure the computer system's +performance across this set of micro-benchmarks +and use a linear model to predict the application's +performance on the computer system. +.RN Seltzer99 +applied this technique using the features +measured by \*[lmbench] as the basis for +application prediction. +.LP +Benchmarking I/O systems has proven particularly +troublesome over the years, largely due to the +strong non-linearities exhibited by disk systems. +Sequential I/O provides much higher bandwidth +than non-sequential I/O, so performance is +highly dependent on the workload characteristics +as well as the file system's ability to +capitalize on available sequentiality by +laying out data contiguously on disk. +.LP +I/O benchmarks have a tendency to age poorly.
+For example, IOStone +.RN Park90a , +IOBench +.RN Wolman89 , +and the Andrew benchmark +.RN Howard88 +used fixed size datasets, whose size was +significant at the time, but which no longer +measure I/O performance as the data can now +fit in the processor cache of many modern +machines. +.LP +The Andrew benchmark attempts to separately +measure the time to create, write, re-read, +and then delete a large number of files in +a hierarchical file system. +.LP +Bonnie +.RN Bray90 +measures sequential, streaming I/O bandwidth +for a single process, and random I/O latency +for multiple processes. +.LP +Peter Chen developed an adaptive harness for +I/O benchmarking +.RN Chen94a , +which defines I/O load in terms of five parameters, +uniqueBytes, sizeMean, readFrac, seqFrac, and +processNum. The benchmark then explores the +parameter space to measure file system performance +in a scalable fashion. +.NH 1 +Computer Architecture Primer +.LP +A processor architecture is generally defined by its +instruction set, but most computer architectures +incorporate a large number of common building blocks +and concepts, such as registers, arithmetic logic +units, and caches. +.LP +Of necessity, this primer over-simplifies the +many details and variations of specific computer +designs and architectures. For more information, +please see +.RN Hennessy96 . +.TSTART 1 +.so lmbench3_arch.pic +.FEND "Architecture diagram" 1 +.LP +Figure \n[FIGURE] contains a greatly simplified block diagram +of a computer. Various important elements, such as +the I/O bus and devices, have been left out. The +core of the processor are the registers (r0, ..., rn +and f0, ..., fn) and the arithmetic units (ALU and FPU). +In general, the arithmetic units can access data in +registers ''instantly''. Often data must be explicitly +loaded from memory into a register before it can be +manipulated by the arithmetic units. 
+.LP +The ALU handles integer arithmetic, such as bit +operations (AND, OR, XOR, NOT, and SHIFT) as +well as ADD, MUL, DIV, and MOD. Sometimes there +is specialized hardware to handle one or more +operations, such as a barrel shifter for SHIFT +or a multiplier, and sometimes there is no +hardware support for certain operations, such +as MUL, DIV, and MOD. +.LP +The FPU handles floating point arithmetic. +Sometimes there are separate FPUs for single +and double precision floating point operations. +.NH 2 +Memory Hierarchy +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip. In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced.
The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size +.RN Hennessy96 . +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. +A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time. +This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. 
+.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 1 +Timing Harness +.LP +The first, and most crucial element in extending +\*[lmbench2] so that it could measure scalable +performance, was to develop a new timing harness +that could accurately measure performance for +any given load. +Once this was done, then each benchmark would +be migrated to the new timing harness. +.LP +The harness is designed to accomplish a number +of goals: +.IP 1. +during any timing interval of any child it is +guaranteed that all other child processes are +also running the benchmark +.IP 2. +the timing intervals are long enough to average +out most transient OS scheduler effects +.IP 3. +the timing intervals are long enough to ensure +that error due to clock resolution is negligible +.IP 4. +timing measurements can be postponed to allow +the OS scheduler to settle and adjust to the +load +.IP 5. +the reported results should be representative +and the data analysis should be robust +.IP 6.
+timing intervals should be as short as possible +while ensuring accurate results +.LP +Developing an accurate timing harness with a +valid experimental design is more difficult +than is generally supposed. +Many programs incorporate elementary timing +harnesses which may suffer from one or more +defects, such as insufficient care taken to +ensure that the benchmarked operation is run +long enough to ensure that the error introduced +by the clock resolution is insignificant. +The basic elements of a good timing harness +are discussed in +.RN Staelin98 . +.LP +The new timing harness must also collect and process +the timing results from all the child processes so +that it can report the representative performance. +It currently reports the median performance over +all timing intervals from all child processes. It +might perhaps be argued that it should report the +median of the medians. +.LP +Most of the benchmarks now accept a "-P <parallelism>" +flag, and the timing harness does the right thing to +try and measure parallel application performance. +.LP +When running benchmarks with more than one child, +the harness must first get a baseline estimate +of performance by running the benchmark in only +one process using the standard \*[lmbench] timing +interval, which is often 5,000 micro-seconds. +Using this information, the harness can compute +the average time per iteration for a single +process, and it uses this figure to compute the +number of iterations necessary to ensure that +each child runs for at least one second. +.NH 2 +Clock resolution +.LP +\*[lmbench] uses the \*[gettimeofday] clock, whose +interface resolves time down to 1 micro-second. +However, many system clock's resolution is only 10 +milli-seconds, and there is no portable way to query +the system to discover the true clock resolution. 
+.LP +The problem is that the timing intervals must +be substantially larger than the clock resolution +in order to ensure that the timing error doesn't +impact the results. For example, the true duration +of an event measured with a 10 milli-second clock +can vary $+-$10 milli-seconds from the true time, +assuming that the reported time is always a +truncated version of the true time. If the clock +itself is not updated precisely, the true error +can be even larger. +This implies that timing intervals on these systems +should be at least 1 second. +.LP +However, the \*[gettimeofday] clock resolution in +most modern systems is 1 micro-second, so timing +intervals can be as small as a few milli-seconds +without incurring significant timing errors related +to clock resolution. +.LP +Since there is no standard interface to query the operating +system for the clock resolution, \*[lmbench] must +experimentally determine the appropriate timing +interval duration which provides results in a timely +fashion with a negligible clock resolution error. +.NH 2 +Coordination +.LP +Developing a timing harness that correctly manages +$N$ processes and accurately measures system performance +over those same $N$ processes is significantly more difficult +than simply measuring system performance with a single +process because of the asynchronous nature of +parallel programming. +.LP +In essence, the new timing harness needs to create +$N$ jobs, and measure the average performance of the +target subsystem while all $N$ jobs are running. This +is a standard problem for parallel and distributed +programming, and involves starting the child +processes and then stepping through a handshaking +process to ensure that all children have started +executing the benchmarked operation before any child +starts taking measurements. +.TSTART 1 +.TS +box tab (/) allbox expand ; +c c +l l .
+Parent/Child +T{ +start up P child processes +T}/T{ +run benchmark operation for a little while +T} +T{ +wait for P "ready" signals +T}/T{ +send a "ready" signal +T} +T{ +[sleep for "warmup" microseconds] +T}/T{ +run benchmark operation while polling for a "go" signal +T} +T{ +send "go" signal to P children +T}/T{ +begin timing benchmark operation +T} +T{ +wait for P "done" signals +T}/T{ +send a "done" signal +T} +T{ +for each child, send "results" signal and gather results +T}/T{ +run benchmark operation while polling for a "results" signal +T} +T{ +collate results +T}/T{ +send timing results and wait for "exit" signal +T} +T{ +send "exit" signal +T}/T{ +exit +T} +.TE +.TEND "Timing harness sequencing" +.LP +Table \n[TABLE] shows how the parent and child +processes coordinate their activities to ensure +that all children are actively running the +benchmark activity while any child could be +taking timing measurements. +.LP +.NH 2 +Accuracy +.LP +The new timing harness also needs to ensure that the +timing intervals are long enough for the results to +be representative. The previous timing harness assumed +that only single process results were important, and +it was able to use timing intervals as short as +possible while ensuring that errors introduced by +the clock resolution were negligible. +In many instances this meant that the timing intervals +were smaller than a single scheduler time slice. +The new timing harness must run benchmarked items +long enough to ensure that timing intervals are longer +than a single scheduler time slice. +Otherwise, you can get results which are complete nonsense. 
+For example, running several copies of an \*[lmbench2] +benchmark on a uni-processor machine will often report +that the performance with $N$ jobs running in parallel +is equivalent to the performance with a single job running!\** +.FS +This was discovered by someone who naively attempted +to parallelize \*[lmbench2] in this fashion, and I +received a note from the dismayed developer describing +the failed experiment. +.FE +.LP +In addition, since the timing intervals now have to be +longer than a single scheduler time slice, they also +need to be long enough so that a single scheduler time +slice is insignificant compared to the timing interval. +Otherwise the timing results can be dramatically +affected by small variations in the scheduler's +behavior. +.NH 2 +Resource consumption +.LP +One important design goal was that resource consumption +be constant with respect to the number of child +processes. +This is why the harness uses shared pipes to communicate +with the children, rather than having a separate set of +pipes to communicate with each child. +An early design of the system utilized a pair of pipes +per child for communication and synchronization between +the master and slave processes. However, as the number +of child processes grew, the fraction of system +resources consumed by the harness grew and the additional +system overhead could start to interfere with the accuracy +of the measurements. +.LP +Additionally, if the master has to poll (\*[select]) +$N$ pipes, then the system overhead of that operation +also scales with the number of children. +.NH 2 +Pipe atomicity +.LP +Since all communication between the master process and +the slave (child) processes is done via a set of shared +pipes, we have to ensure that we never have a situation +where the message can be garbled by the intermingling +of two separate messages from two separate children. 
+This is ensured by either using pipe operations that +are guaranteed to be atomic on all machines, or by +coordinating between processes so that at most one +process is writing at a time. +.LP +The atomicity guarantees are provided by having each +client communicate synchronization states in one-byte +messages. For example, the signals from the master +to each child are one-byte messages, so each child +only reads a single byte from the pipe. Similarly, +the responses from the children back to the master +are also one-byte messages. In this way no child +can receive partial messages, and no message can +be interleaved with any other message. +.LP +However, using this design means that we need to +have a separate pipe for each \fIbarrier\fR in +the process, so the master uses three pipes to +send messages to the children, namely: \fIstart_signal\fR, +\fIresult_signal\fR, and \fIexit_signal\fR. +If a single pipe was used for all three barrier events, +then it is possible for a child to miss a signal, +or if the signal is encoded into the message, +then it is possible for a child to infinite loop +pulling a signal off the pipe, recognizing that +it has already received that signal so that it +needs to push it back into the pipe, and then +re-receiving the same message it just re-sent. +.LP +However, all children share a single pipe to send +data back to the master process. Usually the +messages on this pipe are single-byte signals, +such as \fIready\fR or \fIdone\fR. However, the +timing data results need to be sent from the +children to the master and they are (much) larger +than a single-byte message. In this case, the +timing harness sends a single-byte message on +the \fIresult_signal\fR channel, which can be +received by at most one child process. This +child then knows that it has sole ownership of +the response pipe, and it writes its entire +set of timing results to this pipe.
Once the +master has received all of the timing results +from a single child, it sends the next one-byte +message on the \fIresult_signal\fR channel to +gather the next set of timing results. +.TSTART 1 +.so lmbench3_signals.pic +.FEND "Control signals" 1 +.LP +The design of the signals is shown in Figure \n[FIGURE]. +.NH 2 +Benchmark initialization +.LP +By allowing the benchmark to specify an +initialization routine that is run in the +child processes, the new timing harness +allows benchmarks to do either or both +global initializations that are shared +by all children and specific per-child +initializations that are done independently +by each child. +Global initialization is done in the +master process before the \*[benchmp] +harness is called, so the state is +preserved across the \*[fork] operations. +Per-child initialization is done inside +the \*[benchmp] harness by the optional +initialization routine and is done after +the \*[fork] operation. +.LP +Similarly, each benchmark is allowed to +specify a cleanup routine that is run by +the child processes just before exiting. +This allows the benchmark routines to +release any resources that they may have +used during the benchmark. +Most system resources would be automatically +released on process exit, such as file +descriptors and shared memory segments, +but some resources such as temporary files +might need to be explicitly released by +the benchmark. +.NH 2 +Scheduler transients +.LP +Particularly on multi-processor systems, side-effects +of process migration can dramatically affect program +runtimes. For example, if the processes are all +initially assigned to the same processor as the parent +process, and the timing is done before the scheduler +migrates the processes to other available processors, +then the system performance will appear to be that of +a uniprocessor. 
Similarly, if the scheduler is
+over-enthusiastic about re-assigning processes to
+processors, then performance will be worse than
+necessary because the processes will keep encountering
+cold caches and will pay exorbitant memory access
+costs.
+.LP
+The first case is a scheduler transient, and users
+may not want to measure such transient phenomena
+if their primary interest is in predicting performance
+for long-running programs. Conversely, that same
+user would be extraordinarily interested in the
+second phenomenon. The harness was designed to
+allow users to specify that the benchmarked processes
+are run for long enough to (hopefully) get the
+scheduler past the transient startup phase, so it
+can measure the steady-state behavior.
+.NH 2
+Data analysis
+.LP
+Analyzing the data to produce representative results
+is a crucial step in the benchmarking process.
+\*[lmbench] generally reports the \fImedian\fP
+result for $11$ measurements.
+Most benchmarks report the results of a single measurement
+.RN Howard88 ,
+an average of several results
+.RN McCalpin95 ,
+or a trimmed mean
+.RN Brown97 .
+XXX UNKNOWN:
+.RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90
+.LP
+Since \*[lmbench] is able to use timing intervals
+that are often smaller than a scheduler time slice,
+the raw timing results are often severely skewed.
+The median is preferable to the mean when the data
+can be very skewed
+.RN Jain91 .
+.LP
+In some instances, however, \*[lmbench] internally
+uses the \fIminimum\fP rather than the median,
+such as in \*[mhz].
+In those instances, we are not trying to find the
+\fIrepresentative\fP value, but rather the
+\fIminimum\fP value.
+There are only a few sources of error which could
+cause the measured timing result to be shorter
+than the true elapsed time: the system clock is
+adjusted, or round-off error in the clock resolution.
+The timing interval duration is set to ensure that +the round-off error is bounded to 1% of the timing +interval, and we blithely assume that people don't +reset their system clocks while benchmarking their +systems. +.LP +\*[lmbench] does not currently report any statistics +representing measurement variation, such as the +difference between the first and third quartiles. +.NH 1 +Interface +.LP +Unfortunately we had to move away from the +macro-based timing harness used in \*[lmbench2] +and migrate to a function-based system. +.LP +The new interface looks like: +.DS +typedef void (*bench_f)(uint64 iterations, + void* cookie); +typedef void (*support_f)(void* cookie); + +extern void benchmp(support_f initialize, + bench_f benchmark, + support_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie); +.DE +.LP +A brief description of the parameters: +.IP \fIenough\fR +Enough can be used to ensure that a timing interval is at +least 'enough' microseconds in duration. For most benchmarks +this should be zero, but some benchmarks have to run for more +time due to startup effects or other strange behavior. +.IP \fIparallel\fR +is simply the number of instances of the benchmark +that will be run in parallel on the system. +.IP \fIwarmup\fR +can be used to force the benchmark to run for warmup +microseconds before the system starts making timing measurements. +Note that it is a lower bound, not a fixed value, since it +is simply the time that the parent sleeps after receiving the +last "ready" signal from each child (and before it sends +the "go" signal to the children). +.IP \fIrepetitions\fR +is the number of times the experiment should +be repeated. The default is eleven. +.IP \fIcookie\fR +is a pointer that can be used by the benchmark +writer to pass in configuration information, such as buffer +size or other parameters needed by the inner loop. 
+In \*[lmbench3] it is generally used to point
+to a structure containing the relevant configuration
+information.
+.LP
+To write a simple benchmark for getppid() all you would need
+to do is:
+.DS
+void
+benchmark_getppid(uint64 iterations,
+ void* cookie)
+{
+ while (iterations-- > 0) {
+ getppid();
+ }
+}
+.DE
+.LP
+and then somewhere in your program you might call:
+.DS
+benchmp(NULL, benchmark_getppid, NULL,
+ 0, 1, 0, TRIES, NULL);
+micro("getppid", get_n());
+.DE
+.LP
+A more complex example which has "state" and uses the
+initialization and cleanup capabilities might look something
+like this:
+.DS
+struct bcopy_state {
+ int len;
+ char* src;
+ char* dst;
+};
+.DE
+.DS
+void
+initialize_bcopy(void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ state->src = valloc(state->len);
+ state->dst = valloc(state->len);
+
+ bzero(state->src, state->len);
+ bzero(state->dst, state->len);
+}
+.DE
+.DS
+void
+benchmark_bcopy(uint64 iterations,
+ void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ while (iterations-- > 0) {
+ bcopy(state->src,
+ state->dst, state->len);
+ }
+}
+.DE
+.DS
+void
+cleanup_bcopy(void* cookie)
+{
+ struct bcopy_state* state =
+ (struct bcopy_state*)cookie;
+
+ free(state->src);
+ free(state->dst);
+}
+.DE
+.LP
+and then your program might look something like:
+.DS
+#include "bench.h"
+int
+main()
+{
+ struct bcopy_state state;
+
+ state.len = 8 * 1024 * 1024;
+ benchmp(initialize_bcopy,
+ benchmark_bcopy,
+ cleanup_bcopy,
+ 0, 1, 0, TRIES, &state);
+ fprintf(stderr, "bcopy: ");
+ mb(state.len * get_n());
+ exit(0);
+}
+.DE
+.LP
+Note that this particular micro-benchmark would measure
+cache-to-cache \*[bcopy] performance unless the amount of
+memory being copied was larger than half the cache size.
+A slightly more sophisticated approach might allocate +as much memory as possible and then \*[bcopy] from one +segment to another, changing segments within the allocated +memory before each \*[bcopy] to defeat the caches. +.NH 1 +Benchmarks +.LP +\*[lmbench] contains a large number of micro-benchmarks +that measure various aspects of hardware and operating +system performance. The benchmarks generally measure +latency or bandwidth, but some new benchmarks also +measure parallelism. +.TSTART +.TS +center box tab (&); +c c +l & l . +Name&Measures +_ +&Bandwidth +bw_file_rd&T{ +\*[read] and then load into processor +T} +bw_mem&T{ +read, write, and copy data to/from memory +T} +bw_mmap_rd&read from \*[mmap]'ed memory +bw_pipe&\*[pipe] inter-process data copy +bw_tcp&TCP inter-process data copy +bw_unix&UNIX inter-process +_ +&Latency +lat_connect&TCP socket connection +lat_ctx&T{ +context switch via \*[pipe]-based ``hot-potato'' token passing +T} +lat_fcntl&\*[fcntl] operation +lat_fifo&T{ +FIFO ``hot-potato'' token passing +T} +lat_fs&file creation and deletion +lat_http&http GET request latency +lat_mem_rd&memory read +lat_mmap&\*[mmap] operation +lat_ops&basic operations +lat_pagefault&page fault handler +lat_pipe&\*[pipe] ``hot-potato'' token passing +lat_proc&T{ +procedure call overhead and process creation using \*[fork], +\*[fork] and \*[execve], and \*[fork] and \*[sh] +T} +lat_rpc&SUN RPC procedure call +lat_select&\*[select] +lat_sem&T{ +semaphore ``hot-potato'' token passing +T} +lat_sig&T{ +signal handle installation and handling +T} +lat_syscall&\*[getppid], \*[write], \*[stat], \*[fstat], \*[open], \*[close] +lat_tcp&TCP ``hot-potato'' token passing +lat_udp&UDP ``hot-potato'' token passing +lat_unix&UNIX ``hot-potato'' token passing +lat_unix_connect&UNIX socket connection +_ +&Parallelism +par_mem&memory subsystem +par_ops&T{ +instruction-level parallelism of basic arithmetic operations +T} +_ +mhz&CPU clock frequency +line&cache line size +tlb&number 
of pages mapped by TLB +stream&STREAM clones +lmdd&\fIdd\fR clone +.TE +.TEND "\*[lmbench] micro-benchmarks" +.LP +Table \n[TABLE] contains the full list of micro-benchmarks +in \*[lmbench]. +.NH 2 +Bandwidth +.LP +.LP +By bandwidth, we mean the rate at which a particular facility can move +data. +We attempt to measure the data movement ability of a number of +different facilities: +library \*[bcopy], +hand-unrolled \*[bcopy], +direct-memory read and write (no copying), +pipes, +TCP sockets, +the \*[read] interface, +and +the \*[mmap] interface. +.NH 2 +Memory bandwidth +.LP +Data movement is fundamental to any operating system. +In the past, performance +was frequently measured in MFLOPS because floating point units were +slow enough that microprocessor systems were +rarely limited by memory bandwidth. Today, floating point units are usually much +faster than memory bandwidth, so many current MFLOP ratings can not be +maintained using memory-resident data; they are ``cache only'' ratings. +.LP +We measure the ability to +copy, read, and write data over a varying set of sizes. +There are too many results to report all of them here, so we concentrate on +large memory transfers. +.LP +We measure copy bandwidth two ways. The first is the user-level library +\*[bcopy] interface. +The second is a hand-unrolled loop that loads and stores +aligned 8-byte words. +In both cases, we took care to +ensure that the source and destination locations would not map to the same +lines if the any of the caches were direct-mapped. +In order to test memory bandwidth rather than cache bandwidth, +both benchmarks copy an 8M\** area to another 8M area. +(As secondary caches reach 16M, these benchmarks will have to +be resized to reduce caching effects.) +.FS +Some of the PCs had less than 16M of available memory; +those machines copied 4M. 
+.FE +.LP +The copy results actually represent one-half to one-third of the memory +bandwidth used to obtain those results since we are reading and writing +memory. If the cache line size is larger than the word stored, then +the written cache line will typically be read before it is written. The +actual amount of memory bandwidth used varies because some architectures +have special instructions specifically designed for the \*[bcopy] +function. Those architectures will move twice as much memory as +reported by this benchmark; less advanced architectures move three +times as much memory: the memory read, the memory read because it is +about to be overwritten, and the memory written. +.LP +The \*[bcopy] results reported in Table 2 +may be correlated with John McCalpin's \*[stream] +.RN McCalpin95 +benchmark results in the following manner: +the \*[stream] benchmark reports all of the memory moved +whereas the \*[bcopy] benchmark reports the bytes copied. So our +numbers should be approximately one-half to one-third of his numbers. +.LP +Memory reading is measured by an unrolled loop that sums up a series of +integers. On most (perhaps all) systems measured the integer +size is 4 bytes. The loop is unrolled such that most compilers generate +code that uses a constant offset with the load, resulting in a load and +an add for each word of memory. The add is an integer add that completes +in one cycle on all of the processors. Given that today's processor +typically cycles at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000 +ns per cache line, the results reported here should be dominated by the +memory subsystem, not the processor add unit. +.LP +The memory contents are added up because almost all C compilers +would optimize out the whole loop when optimization was turned on, and +would generate far too many instructions without optimization. 
+The solution is to +add up the data and pass the result as an unused argument to the +``finish timing'' function. +.LP +Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect +that pure reads should run at roughly twice the speed of \*[bcopy]. +Exceptions to this rule should be studied, for exceptions indicate a bug +in the benchmarks, a problem in \*[bcopy], or some unusual hardware. +.TSTART +.so bw_allmem.tbl +.TEND "Memory bandwidth (MB/s)" +.LP +Memory writing is measured by an unrolled loop that stores a value into +an integer (typically a 4 byte integer) and then increments the pointer. +The processor cost of each memory operation is approximately the same +as the cost in the read case. +.LP +The numbers reported in Table \n[TABLE] +are not the raw hardware speed in some cases. +The Power2\** is capable of up to 800M/sec read rates +.FS +Someone described this machine as a $1,000 processor on a $99,000 memory +subsystem. +.FE +.RN McCalpin95 +and HP PA RISC (and other prefetching) +systems also do better if higher levels of code optimization used +and/or the code is hand tuned. +.LP +The Sun libc bcopy in Table \n[TABLE] +is better because they use a hardware specific bcopy +routine that uses instructions new in SPARC V9 that were added specifically +for memory movement. +.LP +The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because, +according to Intel, the write transaction turns into a read followed by +a write to maintain cache consistency for MP systems. +.NH 2 +IPC bandwidth +.LP +Interprocess communication bandwidth is frequently a performance issue. +Many Unix applications are composed of several processes communicating +through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation +system that prepared this paper, the \f(CWX Window System\fP, remote file access, +and \f(CWWorld Wide Web\fP servers. 
+.LP +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +TCP sockets are similar +to pipes except they are bidirectional and can cross machine +boundaries. +.LP +Pipe bandwidth is measured by creating two processes, a writer and a +reader, which transfer 50M of data in 64K transfers. +The transfer size was chosen so that the overhead of system calls +and context switching would not dominate the benchmark time. +The reader prints the timing results, which guarantees that all +data has been moved before the timing is finished. +.LP +TCP bandwidth is measured similarly, except the data is transferred in +1M page aligned transfers instead of 64K transfers. If the TCP +implementation supports it, the send and receive socket buffers are +enlarged to 1M, instead of the default 4-60K. We have found that +setting the transfer size equal to the socket buffer size produces the +greatest throughput over the most implementations. +.TSTART +.so bw_ipc.tbl +.TEND "Pipe and local TCP bandwidth (MB/s)" +.LP +\*[bcopy] is important to this test because the +pipe write/read is typically implemented as a \*[bcopy] into the kernel +from the writer and then a \*[bcopy] from the kernel to the reader. +Ideally, these results would be approximately one-half of the +\*[bcopy] results. It is possible for the kernel \*[bcopy] +to be faster than the C library \*[bcopy] since the kernel may have +access to \*[bcopy] hardware unavailable to the C library. +.LP +It is interesting to compare pipes with TCP because the TCP benchmark is +identical to the pipe benchmark except for the transport mechanism. +Ideally, the TCP bandwidth would be as good as the pipe +bandwidth. It is not widely known that the +majority of the TCP cost is in the \*[bcopy], the checksum, +and the network interface driver. 
+The checksum and the driver may be safely eliminated in the loopback +case and if the costs have been eliminated, then TCP should be just as +fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to +see that Solaris and HP-UX have done this optimization. +.LP +Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the +pipe transfers are done in 64K buffers, a size that frequently fits in +caches, while the bcopy is typically an 8M-to-8M copy, which does not +fit in the cache. +.LP +In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than +the SGI MP on pipe bandwidth because of caching effects - in the UP +case, both processes share the cache; on the MP, each process is +communicating with a different cache. +.LP +All of the TCP results in Table \n[TABLE] are in loopback mode \(em that +is both ends of the socket are on the same machine. It was impossible +to get remote networking results for all the machines included in this +paper. We are interested in receiving more results for identical +machines with a dedicated network connecting them. The results we have +for over the wire TCP bandwidth are shown below. +.TSTART +.so bw_tcp.tbl +.TEND "Remote TCP bandwidth (MB/s)" +.LP +The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. +The SGI Hippi interface has hardware support for TCP checksums and +the IRIX operating system uses virtual memory tricks to avoid copying +data as much as possible. +For larger transfers, SGI Hippi has reached 92MB/s over TCP. +.LP +100baseT is looking quite competitive when compared to FDDI in Table +\n[TABLE], even though FDDI has packets that are almost three times +larger. We wonder how long it will be before we see gigabit ethernet +interfaces. +.NH 2 +Cached I/O bandwidth +.LP +Experience has shown us that reusing data in the file system +page cache can be a performance issue. This +section measures that operation through two interfaces, \*[read] and +\*[mmap]. 
+The benchmark here is not an I/O benchmark in that no disk activity is +involved. +We wanted to measure the overhead +of reusing data, an overhead that is CPU intensive, rather than disk intensive. +.LP +The \*[read] interface copies data from the kernel's file system page cache into the +process's buffer, using 64K buffers. The transfer size was chosen +to minimize the kernel entry overhead while +remaining realistically sized. +.LP +The difference between the \*[bcopy] and the \*[read] benchmarks +is the cost of the file and virtual memory system overhead. In most +systems, the \*[bcopy] speed should be faster than the \*[read] speed. The +exceptions usually have hardware specifically designed +for the \*[bcopy] function and that hardware may be available only to +the operating system. +.LP +The \*[read] benchmark is implemented by rereading a file +(typically 8M) in 64K +buffers. Each buffer is summed as a series of integers in the user +process. The summing is done for two reasons: for an apples-to-apples +comparison the memory-mapped benchmark needs to touch all the data, +and the file system can sometimes transfer data into memory faster than the +processor can read the data. +For example, \s-1SGI\s0's XFS can move data into memory at +rates in excess of 500M per second, but it can move data into +the cache at only 68M per second. The intent is to measure performance +delivered to the application, not DMA performance to memory. +.TSTART +.so bw_reread2.tbl +.TEND "File vs. memory bandwidth (MB/s)" +.LP +The \*[mmap] interface provides a way to access the kernel's file cache +without copying the data. +The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) +into the +process's address space. The file is then summed to force the data +into the cache. 
+.LP +In Table \n[TABLE], +a good system will have \fIFile read\fP as fast as (or even faster than) +\fILibc bcopy\fP because as the file system overhead goes to zero, the +file reread case is virtually the same as the library \*[bcopy] case. +However, file reread can be faster because the kernel may have access to +\*[bcopy] assist hardware not available to the C library. +Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP +performance, but \*[mmap] is often dramatically worse. +Judging by the results, this looks to be a +potential area for operating system improvements. +.LP +In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes +full advantage of the memory subsystem from inside the kernel. +The mmap reread is probably slower because of the lower clock rate; +the page faults start to show up as a significant cost. +.LP +It is surprising that the Sun Ultra1 was able to bcopy at the high +rates shown in Table 2 but did not show those rates for file reread +in Table \n[TABLE]. +HP has the opposite problem, they get file reread faster than bcopy, +perhaps because the kernel \*[bcopy] has access to hardware support. +.LP +The Unixware system has outstanding mmap reread rates, better than +systems of substantially higher cost. Linux needs to do some work on +the \f(CWmmap\fP code. +.NH 2 +Latency +.LP +Latency is an often-overlooked +area of performance problems, possibly because resolving latency issues +is frequently much harder than resolving bandwidth issues. For example, +memory bandwidth may be increased by making wider cache lines and increasing +memory ``width'' and interleave, +but memory latency can be improved only by shortening paths or increasing +(successful) prefetching. +The first step toward improving latency is understanding the +current latencies in a system. 
+.LP +The latency measurements included in this suite are +memory latency, +basic operating system entry cost, +signal handling cost, +process creation times, +context switching, +interprocess communication, +.\" virtual memory system latency, +file system latency, +and disk latency. +.NH 2 +Memory read latency background +.LP +In this section, we expend considerable effort to define the different memory +latencies and to explain and justify our benchmark. +The background is a bit tedious but important, since we believe the +memory +latency measurements to be one of the most thought-provoking and useful +measurements in \*[lmbench]. +.LP +The most basic latency measurement is memory latency since most of +the other latency measurements can be expressed in terms of memory +latency. For example, context switches require saving the current +process state and loading the state of the next process. However, memory +latency is rarely accurately measured and frequently misunderstood. +.LP +Memory read latency has many definitions; +the most common, +in increasing time order, +are memory chip cycle time, processor-pins-to-memory-and-back time, +load-in-a-vacuum time, and back-to-back-load time. +.BU "Memory chip cycle latency" : +Memory chips are rated in nanoseconds; typical speeds are around 60ns. +A general overview on DRAM architecture may be found in +.RN Hennessy96 . +The +specific information we describe here is from +.RN Toshiba94 +and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0 +\s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the +time from +.ps -1 +.nr width \w'R\&A\&S' +.nr height \n[rst]+1000 +RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +assertion to the when +the data will be available on the \s-1DRAM\s0 pins (assuming +.ps -1 +.nr width \w'C\&A\&S' +.nr height \n[rst]+1000 +CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' +.ps +access time requirements were met). 
+While it is possible +to get data out of a \s-1DRAM\s0 in 60ns, that is not all of +the time involved. There is a precharge time that must occur after +every access. +.RN Toshiba94 +quotes 110ns as the random read or write cycle time and this +time is more representative of the cycle time. +.\" For example, most systems offer a wide range of memory +.\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number +.\" of simms range from 1 to 16. The more simms there are, the more +.\" capacitance there is in the memory subsystem. More capacitance means +.\" longer setup times for the fully populated memory subsystem. System +.\" designers have to allow time for this setup. +.\" For more details, consult [XXX - reference on DRAM]. +.\" This is sometimes referred to as the chip latency. The +.\" chip cycle time is the chip latency plus the time required to restore +.\" the data in the capacitors which is often referred to as the precharge +.\" time. This means that 60 nanosecond memory chips really are more like +.\" 100 nanosecond memory chips. Some systems operate memory in ``page +.\" mode'' or ``static column'' memory systems hold either RAS or CAS and +.\" allow subsequent accesses in the same row or column in one cycle instead +.\" of two. +.BU "Pin-to-pin latency" : +This number represents the time needed +for the memory request to travel from the processor's pins to the memory +subsystem and back again. Many vendors have used the pin-to-pin +definition of memory latency in their reports. For example, +.RN Fenwick95 +while describing the \s-1DEC\s0 8400 +quotes memory latencies of 265ns; a careful +reading of that paper shows that these are pin-to-pin numbers. In spite +of the historical precedent in vendor reports, this definition of memory +latency is misleading since it ignores actual delays seen when a load +instruction is immediately followed by a use of the data being loaded. 
+The number of additional cycles inside the processor can be significant
+and grows more significant with today's highly pipelined architectures.
+.LP
+It is worth noting that the pin-to-pin numbers
+include the amount of time it takes to charge
+the lines going to the \s-1SIMM\s0s, a time that increases with the
+(potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean
+more capacitance which results in longer charge times. This is one reason
+why personal computers frequently have better memory latencies than
+workstations: the PCs typically have less memory capacity.
+.BU "Load-in-a-vacuum latency" :
+A load in a vacuum is the time that the processor will wait for one load that
+must be fetched from main memory (i.e., a cache miss). The ``vacuum''
+means that there is no other activity on the system bus, including no other
+loads.
+While this number is frequently used as the memory latency, it is not very
+useful. It is basically a ``not to exceed'' number important only for
+marketing reasons.
+Some architects point out that since most processors implement nonblocking
+loads (the load does not cause a stall until the data is used), the perceived
+load latency may be much less than the real latency. When pressed, however,
+most will admit that cache misses occur in bursts, resulting in perceived
+latencies of at least the load-in-a-vacuum latency.
+.BU "Back-to-back-load latency" :
+Back-to-back-load latency is the time that each load takes, assuming
+that the instructions before and after are also cache-missing loads.
+Back-to-back loads may take longer than loads in a vacuum for the
+following reason: many systems implement something known as \fIcritical
+word first\fP, which means that the subblock of the cache line that
+contains the word being loaded is delivered to the processor before the
+entire cache line has been brought into the cache.
If another load +occurs quickly enough after the processor gets restarted from the +current load, the second load may stall because the cache is still +busy filling the cache line for the previous load. On some systems, +such as the current implementation of UltraSPARC, +the difference between back to back and load in a vacuum is about 35%. +.LP +\*[lmbench] measures back-to-back-load latency because it is the +only measurement that may be easily measured from software and +because we feel that it is what most software developers consider to be memory +latency. Consider the following C code fragment: +.DS +.nf +.ft CW +p = head; +while (p->p_next) + p = p->p_next; +.ft +.fi +.DE +On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the +load. A 300 Mhz processor has a 3.33ns cycle time, so the loop +could execute in slightly less than 10ns. However, the load itself +takes 400ns on a 300 Mhz \s-1DEC\s0 8400. In other words, the +instructions cost 10ns but the load stalls for 400. Another +way to look at it is that 400/3.3, or 121, nondependent, +nonloading instructions following the load would be needed +to hide the load latency. +Because superscalar processors typically execute multiple operations +per clock cycle, they need even more useful operations between cache +misses to keep the processor from stalling. +.LP +This benchmark illuminates the tradeoffs in processor cache design. +Architects like large cache lines, up to 64 bytes or so, because +the prefetch effect of gathering a whole line increases +hit rate given reasonable spatial locality. +Small stride sizes have high spatial locality and should have higher +performance, but large stride sizes have poor spatial locality causing +the system to prefetch useless data. 
+So the benchmark provides the following insight into negative +effects of large line prefetch: +.BU +Multi-cycle fill operations are typically atomic events at the +caches, and sometimes block other cache accesses until they +complete. +.BU +Caches are typically single-ported. Having a large line prefetch +of unused data causes extra bandwidth +demands at the cache, and can cause increased access latency for +normal cache accesses. +.LP +In summary, we believe that processors are so fast that the average +load latency for cache misses will be closer to the +back-to-back-load number than to the load-in-a-vacuum number. We are +hopeful that the industry will standardize on this definition of +memory latency. +.NH 2 +Memory read latency +.LP +The entire memory hierarchy can be measured, including on-board data +cache latency and size, external data cache latency and size, and +main memory latency. +Instruction caches are not measured. +TLB miss latency can also be measured, as in +.RN Saavedra92 , +but we stopped at main memory. Measuring TLB miss time is problematic +because different systems map different amounts of memory with their +TLB hardware. +.LP +The benchmark varies two parameters, array size and array stride. +For each size, a list of pointers is created for all of the different +strides. Then the list is walked thus: +.DS +.ft CW +mov r4,(r4) # C code: p = *p; +.ft +.DE +The time to do about 1,000,000 loads (the list wraps) is measured and +reported. The time reported is pure latency time and may be zero even though +the load instruction does not execute in zero time. Zero is defined as one +clock cycle; in other words, the time reported is \fBonly\fP memory latency +time, as it does not include the instruction execution time. It is assumed +that all processors can do a load instruction in one processor cycle +(not counting stalls). 
In other words, if the processor cache load time +is 60ns on a 20ns processor, the load latency reported +would be 40ns, the additional 20ns is for the load instruction +itself.\** +.FS +In retrospect, this was a bad idea because we calculate the clock +rate to get the instruction execution time. If the clock rate is off, +so is the load time. +.FE +Processors that can manage to get the load address out to the +address pins before the end of the load cycle get some free time in this +benchmark (we don't know of any processors that do that). +.LP +This benchmark has been validated by logic analyzer measurements +on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer +Research Center. +.TSTART 1 +.so mem.pic +.FEND "Memory latency" 1 +.LP +Results from the memory latency benchmark are plotted as a series of data sets +as shown in Figure \n[FIGURE]. +Each data set represents a stride size, +with the array size varying from 512 bytes up to 8M or more. +The curves contain a series of +horizontal plateaus, where each plateau represents a level in the +memory hierarchy. +The point where each plateau ends and the line rises marks the +end of that portion of the memory hierarchy (e.g., external cache). +Most machines have similar memory hierarchies: +on-board cache, external cache, main memory, and main memory plus TLB +miss costs. +There are variations: some processors are missing a cache, while +others add another cache to the hierarchy. +.\" XXX Larry please double-check this; I am going on dim memory... +For example, the Alpha 8400 has two on-board caches, one 8K +and the other 96K. +.LP +The cache line size can be derived by comparing curves and noticing which +strides are faster than main memory times. The smallest stride that is +the same as main memory speed is likely to be the cache line size because +the strides that are faster than memory are +getting more than one hit per cache line. 
+.\" Prefetching may confuse +.\" the issue because a demand read may stall behind a prefetch load, +.\" causing cache lines to appear twice as large as they are. +.\" XXX +.\" Larry --- can we use prime modulus arithmetic to set up pointer +.\" loops which might appear random but which really aren't and which +.\" hit every stride once before looping? +.\" +.\" XXX +.\" Larry --- is there any way we can defeat/disable prefetching +.\" so the cache line size can be more accurately determined? +.\" +.\" XXX +.\" Larry --- can we create a benchmark for TLB misses? +.\" I think it was Tom Rokicki who suggested that we create a +.\" benchmark where the data fits in the cache, but the pages don't +.\" fit in the TLB. +.\" +.\" XXX +.\" Larry --- is the description of the memory hierarchy correct? +.\" I am not sure I haven't added an extra level of external cache... +.EQ +delim $$ +.EN +.LP +Figure \n[FIGURE] shows memory latencies on a nicely made machine, +a \s-1DEC\s0 Alpha. +We use this machine as the example +because it shows the latencies and sizes of +the on-chip level 1 and motherboard level 2 caches, and because it +has good all-around numbers, especially considering it can support a +4M level 2 cache. +The on-board cache is $2 sup 13$ bytes or 8K, while the +external cache is $2 sup 19$ bytes or 512K. +.EQ +delim off +.EN +.TSTART +.so lat_allmem.tbl +.TEND "Cache and memory latency (ns)" +.nr MEMTABLE \n[TABLE] +.LP +Table \n[TABLE] shows the cache size, cache latency, and main memory +latency as extracted from the memory latency graphs. +The graphs and the tools for extracting the data are +included with \*[lmbench]. +It is worthwhile to plot all of the graphs and examine them since the +table is missing some details, such as the +\s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache. +.LP +We sorted Table \n[TABLE] on level 2 cache latency because we think +that many applications will fit in the level 2 cache. 
The HP and IBM
+systems have only one level of cache so we count that as both level 1
+and level 2. Those two systems have remarkable cache performance for
+caches of that size. In both cases, the cache delivers data in one
+clock cycle after the load instruction.
+.LP
+HP systems usually focus on
+large caches as close as possible to the processor. An older HP
+multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped
+cache with a 2K victim cache, accessible in one clock (16ns).\** That system is
+primarily a database server.
+.FS
+The Usenix version of this paper had this as a set associative cache; that was
+incorrect.
+.FE
+.LP
+The IBM focus is on low latency, high
+bandwidth memory. The IBM memory subsystem is good because all of
+memory is close to the processor, but has the weakness that it is
+extremely difficult to evolve the design to a multiprocessor system.
+.LP
+The 586 and PowerPC motherboards have quite poor second level caches;
+the caches are not substantially better than main memory.
+.LP
+The Pentium Pro and Sun Ultra second level caches are of medium speed
+at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared
+against the HP and IBM one cycle latency caches of similar size.
+Given the tight integration of the Pentium Pro level 2 cache, it is
+surprising that it has such high latencies.
+.LP
+The 300Mhz DEC Alpha has a rather high 22 clock latency to the second
+level cache which is probably one of the reasons that they needed a 96K
+level 1.5 cache. SGI and DEC have used large second level caches
+to hide their long latency from main memory.
+.LP
+.NH 2
+Operating system entry
+.LP
+Entry into the operating system is required for many system facilities.
+When calculating the cost of a facility, it is useful to know how
+expensive it is to perform a nontrivial entry into the operating system.
+.LP
+We measure nontrivial entry into the system by repeatedly writing one
+word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but
+discard the data. This particular entry point was chosen because it has
+never been optimized in any system that we have measured. Other entry
+points, typically \*[getpid] and \*[gettimeofday], are heavily used,
+heavily optimized, and sometimes implemented as user-level library
+routines rather than system calls.
+A write to the \f(CW/dev/null\fP driver will go
+through the system call table to \*[write], verify the user area as
+readable, look up the file descriptor to get the vnode, call the vnode's
+write function, and then return.
+.TSTART
+.so lat_nullsys.tbl
+.TEND "Simple system call time (microseconds)"
+.LP
+Linux is the clear winner in the system call time. The reasons are
+twofold: Linux is a uniprocessor operating system, without any
+MP overhead, and Linux is a small operating system, without all
+of the ``features'' accumulated by the commercial offerings.
+.LP
+Unixware and Solaris are doing quite well, given that they are both fairly
+large, commercially oriented operating systems with a large accumulation
+of ``features.''
+.NH 2
+Signal handling cost
+.LP
+Signals in Unix are a way to tell another process to handle an event. They
+are to processes as interrupts are to the CPU.
+.LP
+Signal handling is often critical to layered systems. Some applications,
+such as databases, software development environments, and threading libraries
+provide an operating system-like layer on top of the operating system,
+making signal handling a critical path in many of these applications.
+.LP
+\*[lmbench] measures both signal installation and signal dispatching in two separate
+loops, within the context of one process.
+It measures signal handling by installing a signal handler and then repeatedly
+sending itself the signal.
+.TSTART +.so lat_signal.tbl +.TEND "Signal times (microseconds)" +.LP +Table \n[TABLE] shows the signal handling costs. +Note that there are no context switches in this benchmark; the signal goes +to the same process that generated the signal. In real applications, +the signals usually go to another process, which implies +that the true cost of sending that signal is the signal overhead plus the +context switch overhead. We wanted to measure signal and context +switch overheads separately since context +switch times vary widely among operating systems. +.LP +SGI does very well on signal processing, +especially since their hardware is of an older generation than +many of the others. +.LP +The Linux/Alpha signal handling numbers are so poor +that we suspect that this is a bug, especially given that the Linux/x86 +numbers are quite reasonable. +.NH 2 +Process creation costs +.LP +Process benchmarks are used to measure the basic process primitives, +such as creating a new process, running a different program, and context +switching. Process creation benchmarks are of particular interest +in distributed systems since many remote operations include the creation +of a remote process to shepherd the remote operation to completion. +Context switching is important for the same reasons. +.BU "Simple process creation" . +The Unix process creation primitive is \*[fork], which +creates a (virtually) exact copy of the calling process. +Unlike VMS and some other operating systems, Unix starts any new process +with a \*[fork]. +Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and +``light,'' facts that many have been ignoring for some time. +.LP +\*[lmbench] measures simple process creation by creating a process +and immediately +exiting the child process. The parent process waits for the child +process to exit. +The benchmark is intended to measure the overhead for creating a +new thread of control, so it includes the \*[fork] and +the \*[exit] time. 
+.LP
+The benchmark also includes a \f(CWwait\fP system call in the parent and
+context switches from the parent to the child and back again. Given that
+context switches of this sort are on the order of 20 microseconds and a
+system call is on the order of 5 microseconds, and that the entire benchmark
+time is on the order of a millisecond or more, the extra overhead
+is insignificant.
+Note that even this relatively simple task is very expensive and is
+measured in milliseconds while most of the other operations we consider are
+measured in microseconds.
+.BU "New process creation" .
+The preceding benchmark did not create a new application; it created a
+copy of the old application. This benchmark measures the cost of creating a
+new process and changing that process into a new application, which
+forms the basis of every Unix command
+line interface, or shell.
+\*[lmbench] measures this facility by forking a new child and having that child
+execute a new program \(em in this case, a tiny program that prints
+``hello world'' and exits.
+.LP
+The startup cost is especially noticeable
+on (some) systems that have shared libraries. Shared libraries can
+introduce a substantial (tens of milliseconds) startup cost.
+.\" XXX - statically linked example?
+.TSTART
+.so lat_allproc.tbl
+.TEND "Process creation time (milliseconds)"
+.BU "Complicated new process creation" .
+When programs start other programs, they frequently use one of
+three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first
+two interfaces start a new process by invoking the standard command
+interpreter, \f(CW/bin/sh\fP, to start the process. Starting programs this way
+guarantees that the shell will look for the requested application
+in all of the places that the user would look \(em in other words, the shell
+uses the user's $PATH variable as a list of places to find the
+application.
\*[execlp] is a C library routine which also looks for the +program using the user's $PATH variable. +.LP +Since this is a common way of starting applications, we felt it +was useful to show the costs of the generality. +.LP +We measure this by starting \f(CW/bin/sh\fP to start the same tiny +program we ran in the last case. +In Table \n[TABLE] the cost of asking the shell to go +look for the program is +quite large, frequently ten times as expensive as just creating a +new process, and four times as expensive as explicitly naming the location +of the new program. +.LP +The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. +Given that the processor is one of the fastest, the problem is likely to be +software. There is room for substantial improvement in the Solaris +process creation code. +.NH 2 +Context switching +.LP +Context switch time is defined here as +the time needed to save the state of one process and restore the state +of another process. +.LP +Context switches are frequently in the critical performance path of +distributed applications. For example, the multiprocessor versions +of the IRIX operating system use +processes to move data through the networking stack. This means that the +processing time for each new packet arriving at an idle system includes +the time needed to switch in the networking process. +.LP +Typical context switch benchmarks measure just the minimal context switch +time \(em the time to switch between two processes that are doing nothing +but context switching. We feel that this is +misleading because there are frequently more than two active processes, +and they usually have a larger working set (cache footprint) +than the benchmark processes. +.LP +Other benchmarks frequently include the cost of +the system calls needed to force the context switches. +For example, Ousterhout's context switch benchmark +measures context switch time plus a \*[read] and a \*[write] +on a pipe. 
+In many of the systems measured by \*[lmbench], the pipe overhead +varies between 30% and 300% of the context switch time, so we were +careful to factor out the pipe overhead. +.BU "Number of processes." +The context switch benchmark is implemented as +a ring of two to twenty processes that are connected with Unix pipes. +A token is passed from process to process, forcing context switches. +The benchmark measures the time needed to pass +the token two thousand times from process to process. +Each transfer of the token has two costs: the context switch, and +the overhead of passing the token. +In order to calculate just the context switching time, the benchmark first +measures the cost of passing the token through a ring of pipes in a +single process. This overhead time is defined as the cost of passing +the token and is not included in the reported context switch time. +.BU "Size of processes." +In order to measure more realistic context switch times, we add +an artificial variable size ``cache footprint'' to the switching +processes. The cost of the context switch then includes the cost +of restoring user-level state (cache footprint). The cache footprint +is implemented by having the process allocate an array of data\** +.FS +All arrays are at the same virtual +address in all processes. +.FE +and sum +the array as a series of integers after receiving the token but before +passing the token to the next process. Since most systems will cache data +across context switches, the working set for the benchmark is slightly +larger than the number of processes times the array size. +.LP +It is worthwhile to point out that the overhead mentioned above +also includes the cost of accessing the data, in the same way as +the actual benchmark. However, because the overhead is measured +in a single process, the cost is typically the cost with ``hot'' +caches. 
In Figure 2, each size is plotted as a line, with
+context switch times on the Y axis, number of processes on the
+X axis, and the process size as the data set.
+The process size and the hot cache overhead costs for
+the pipe read/writes and any data access is what is labeled
+as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead
+is in microseconds.
+.LP
+The context switch time does not include anything other than
+the context switch, provided that all the benchmark processes fit in the
+cache. If the total size of all of the benchmark processes is larger
+than the cache size, the cost of each context switch will include cache
+misses.
+We are trying to show realistic context switch times as a
+function of both size and number of processes.
+.TSTART 1
+.so ctx.pic
+.FEND "Context switch times" 1
+.LP
+Results for an Intel Pentium Pro system running Linux at 167 MHz are
+shown in Figure \n[FIGURE].
+The data points on the figure are labeled with the working set
+due to the sum of data in all of the processes. The actual working set is
+larger, as it includes the process and kernel overhead as well.
+One would expect the context switch times to stay constant until
+the working set is
+approximately the size of the second level cache. The Intel system has a
+256K second level cache, and the context switch times
+stay almost constant until about 256K (marked as .25M in the graph).
+.BU "Cache issues" .
+The context switch benchmark is a deliberate measurement of the
+effectiveness of the caches across process context switches. If the
+cache does not include the process identifier (PID, also sometimes
+called an address space identifier) as part of the address, then the
+cache must be flushed on every context switch. If the cache does not map
+the same virtual addresses from different processes to different cache
+lines, then the cache will appear to be flushed on every context
+switch.
+.LP +If the caches do +not cache across context switches there would be no grouping at the +lower left corner of Figure \n[FIGURE], instead, the graph would +appear as a series of straight, horizontal, parallel lines. The number +of processes will not matter, the two process case will be just as bad +as the twenty process case since the cache would not be +useful across context switches. +.TSTART +.so ctx.tbl +.TEND "Context switch time (microseconds)" +.LP +We picked four points on the graph and extracted those values for Table +\n[TABLE]. The complete set of values, as well as tools to graph them, +are included with \*[lmbench]. +.LP +Note that multiprocessor context switch times are frequently more expensive +than uniprocessor context switch times. This is because multiprocessor +operating systems tend to have very complicated scheduling code. +We believe that multiprocessor context switch times can be, and should be, +within 10% of the uniprocessor times. +.LP +Linux does quite well on context switching, especially on the more +recent architectures. By comparing the Linux 2 0K processes to the +Linux 2 32K processes, it is apparent that there is something wrong +with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can +find at least part of the cause. The second level cache latency for the +i586 is substantially worse than either the i686 or the Alpha. +.LP +Given the poor second level cache behavior of the PowerPC, it is surprising +that it does so well on context switches, especially the larger sized cases. +.LP +The Sun Ultra1 context switches quite well in part because of enhancements +to the register window handling in SPARC V9. +.NH 2 +Interprocess communication latencies +.LP +Interprocess communication latency is important because many operations +are control messages to another process (frequently on another +system). 
The time to tell the remote process to +do something is pure overhead and is frequently in the critical path +of important functions such as distributed applications (e.g., +databases, network servers). +.LP +The interprocess communication latency benchmarks typically have the +following form: pass a small message (a byte or so) back and forth between two +processes. The reported results are always the microseconds needed +to do one round trip. For one way timing, +about half the round trip is right. However, the CPU cycles tend to be +somewhat asymmetric for one trip: receiving is typically more +expensive than sending. +.BU "Pipe latency" . +Unix pipes are an interprocess communication mechanism implemented as +a one-way byte stream. Each end of the stream has an associated file +descriptor; one is the write descriptor and the other the read +descriptor. +.LP +Pipes are frequently used as a local IPC mechanism. Because of the +simplicity of pipes, they are frequently the fastest portable +communication mechanism. +.LP +Pipe latency is measured by creating a pair of pipes, forking a child process, +and passing a word back and forth. This benchmark is identical to the +two-process, zero-sized context switch benchmark, except that it includes +both the context switching time and the pipe overhead in the results. +.nr NTABLE \n[TABLE]+1 +.nr LTABLE \n[TABLE] +Table \n[NTABLE] shows the round trip latency from process A to process B +and back to process A. +.TSTART +.so lat_pipe.tbl +.TEND "Pipe latency (microseconds)" +.LP +The time can be broken down to two context switches plus four system calls +plus the pipe overhead. The context switch component is two of the small +processes in Table \n[LTABLE]. +This benchmark is identical to the context switch benchmark in +.RN Ousterhout90 . +.BU "TCP and RPC/TCP latency" . 
+TCP sockets may be viewed as an interprocess communication mechanism similar +to pipes with the added feature that TCP sockets work across machine +boundaries. +.LP +TCP and RPC/TCP connections are frequently used in low-bandwidth, +latency-sensitive applications. The default Oracle distributed +lock manager uses TCP sockets, and the locks per second available +from this service are accurately modeled by the TCP latency test. +.TSTART +.so lat_tcp.tbl +.TEND "TCP latency (microseconds)" +.LP +Sun's RPC is layered either over TCP or over UDP. +The RPC layer is responsible for managing connections (the port mapper), +managing different byte orders and word sizes (XDR), and implementing a +remote procedure call abstraction. +Table \n[TABLE] shows the same benchmark with and +without the RPC layer to show the cost of the RPC implementation. +.LP +TCP latency is measured by having a server process that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is one +round-trip time. The measurements in Table \n[TABLE] are local +or loopback measurements, +since our intent is to show the overhead of the software. The same benchmark +may be, and frequently is, used to measure host-to-host latency. +.LP +Note that the RPC layer frequently adds hundreds of microseconds of +additional latency. The problem is not the external data +representation (XDR) layer \(em the +data being passed back and forth is a byte, so there is no XDR to be done. +There is no justification for the extra cost; it is simply +an expensive implementation. DCE RPC is worse. +.TSTART +.so lat_udp.tbl +.TEND "UDP latency (microseconds)" +.BU "UDP and RPC/UDP latency" . +UDP sockets are an alternative to TCP sockets. They differ in that UDP +sockets are unreliable messages that leave the retransmission issues to +the application. UDP sockets have a few advantages, however. 
They preserve +message boundaries, whereas TCP does not; and a single UDP socket may +send messages +to any number of other sockets, whereas TCP sends data to only one place. +.LP +UDP and RPC/UDP messages are commonly used in many client/server applications. +NFS is probably the most widely used RPC/UDP application in the world. +.LP +Like TCP latency, UDP latency is measured by having a server process +that waits for connections +and a client process that connects to the server. The two processes then +exchange a word between them in a loop. The latency reported is round-trip +time. The measurements in Table \n[TABLE] are local or loopback measurements, +since our intent is to show the overhead of the software. +Again, note that the RPC library can add hundreds of microseconds of extra +latency. +.\" .LP +.\" It is interesting to compare UDP latency with TCP latency. In many cases the +.\" TCP latency is \fBless\fP than the UDP latency. This flies in the face +.\" of conventional wisdom, which says that TCP is an inherently more expensive +.\" protocol than UDP. The reasons that TCP may appear faster are: in this +.\" benchmark, the protocol costs are dwarfed by the other costs (context +.\" switching, system calls, and driver overhead); and TCP is frequently +.\" hand-tuned for performance, while UDP is rarely hand-tuned. +.TSTART +.so lat_ipc.tbl +.TEND "Remote latencies (microseconds)" +.BU "Network latency" . +We have a few results for over the wire latency included in Table \n[TABLE]. +As might be expected, the most heavily used network interfaces (i.e., ethernet) +have the lowest latencies. The times shown include the time on the wire, +which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit +ethernet and FDDI, and less than 10 microseconds for Hippi. +.BU "TCP connection latency" . +TCP is a connection-based, reliable, byte-stream-oriented protocol. 
As +part of this reliability, a connection must be established before any +data can be transferred. The connection is accomplished by a ``three-way +handshake,'' an exchange of packets when the client attempts to connect +to the server. +.LP +Unlike UDP, where no connection is established, TCP sends packets +at startup time. If an application creates a TCP connection to send +one message, then the startup time can be a substantial +fraction of the total connection and transfer costs. +The benchmark shows that the connection cost is approximately half of +the cost. +.LP +Connection cost is measured by having a server, registered using +the port mapper, waiting for connections. The client figures out where the +server is registered and then repeatedly times a \*[connect] system call to +the server. The socket is closed after each connect. Twenty connects +are completed and the fastest of them is used as the result. The time measured +will include two of the three packets that make up the three way TCP handshake, +so the cost is actually greater than the times listed. +.\" XXX Larry --- if a machine's clock granularity is on the order of +.\" 10 milliseconds, won't this benchmark run into granularity problems? +.TSTART +.so lat_connect.tbl +.TEND "TCP connect latency (microseconds)" +.LP +Table \n[TABLE] shows that if the need is to send +a quick message to another process, given that most packets get through, +a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive +acknowledgments are needed, which they are in order to have an apples-to-apples +comparison with TCP). If the transmission medium is 10Mbit Ethernet, the +time on the wire will be approximately 65 microseconds each way, or 130 +microseconds total. To do the same thing with a short-lived TCP +connection would cost 896 microseconds of wire time alone. +.LP +The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor +is the point to suggest that all messages should be UDP. 
In many cases, +the difference between 130 microseconds and 900 microseconds is +insignificant compared with other aspects of application performance. +However, if the application is very latency sensitive +and the transmission medium is slow (such as serial link or a message +through many routers), then a UDP message may prove cheaper. +.NH 2 +File system latency +.LP +File system latency is defined as the time required to create or delete +a zero length file. +We define it this way because in many file systems, +such as the BSD fast file system, the directory operations are done +synchronously in order to maintain on-disk integrity. Since the +file data is typically cached and sent to disk at some later date, +the file creation and deletion become the bottleneck +seen by an application. This bottleneck is substantial: to do +a synchronous update to a disk is a matter of tens of milliseconds. +In many cases, this bottleneck is much more of a perceived performance +issue than processor speed. +.LP +The benchmark creates 1,000 zero-sized files and then deletes them. +All the files are created in one directory and their names are +short, such as "a", "b", "c", ... "aa", "ab", .... +.TSTART +.so lat_fs.tbl +.TEND "File system latency (microseconds)" +.LP +The create and delete latencies are shown in Table \n[TABLE]. +Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster +than the slowest systems. However, Linux does not guarantee +anything about the disk integrity; the directory operations are done in +memory. Other fast systems, such as SGI's XFS, use a log to guarantee the +file system integrity. +The slower systems, all those with ~10 millisecond file latencies, are +using synchronous writes to guarantee the file system integrity. +Unless Unixware has modified UFS substantially, they must be running in +an unsafe mode since the FreeBSD UFS is much slower and both file +systems are basically the 4BSD fast file system. 
+.NH 2 +Disk latency +.\" XXX - either get more results for this benchmark or delete it. +.\" I'd really like to not delete it - lmdd is probably the most +.\" useful tool and it gets the least press. +.LP +Included with \*[lmbench] is a small benchmarking program useful for +measuring disk and file I/O. \*[lmdd], which is patterned after +the Unix utility \f(CWdd\fP, measures both sequential and random I/O, +optionally generates patterns on output and checks them on input, +supports flushing the data from the buffer cache on systems that +support \f(CWmsync\fP, and has a very flexible user interface. +Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script +wrapped around \*[lmdd]. +.LP +While we could have generated both sequential and random I/O results as +part of this paper, we did not because those benchmarks are heavily +influenced by the performance of the disk drives used in the test. We +intentionally measure only the system overhead of a SCSI command since +that overhead may become a bottleneck in large database configurations. +.LP +Some important applications, such as transaction processing, are +limited by random disk IO latency. +Administrators can increase the number of disk operations per +second by buying more disks, until the processor overhead becomes +the bottleneck. +The \*[lmdd] benchmark measures the processor overhead associated with each +disk operation, and it can provide an upper bound on the number of +disk operations the processor can support. +It is designed for SCSI disks, and it assumes that most +disks have 32-128K read-ahead buffers and that they can read ahead +faster than the processor can request the chunks of data.\** +.FS +This may not always be true: a processor could be fast enough to make the +requests faster than the rotating disk. +If we take 6M/second to be disk +speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or +81 microseconds/IO. 
We don't know of any processor/OS/IO controller
+combinations that can do an IO in 81 microseconds.
+.FE
+.LP
+The benchmark simulates a large number of disks by reading 512-byte
+transfers sequentially from the raw disk device (raw disks are unbuffered
+and are not read ahead by Unix).
+Since the disk can read ahead faster than the system can request
+data, the benchmark is doing small transfers of data from the
+disk's track buffer.
+Another way to look at this is that the benchmark
+is doing memory-to-memory transfers across a SCSI channel.
+It is possible to generate loads of more than 1,000 SCSI
+operations/second on a single SCSI disk. For comparison, disks under
+database load typically run at 20-80 operations per second.
+.TSTART
+.so lat_disk.tbl
+.TEND "SCSI I/O overhead (microseconds)"
+.LP
+The resulting overhead number represents a
+\fBlower\fP bound on the overhead of a disk I/O.
+The real overhead numbers will be higher on SCSI systems because
+most SCSI controllers will not disconnect if the request can be
+satisfied immediately.
+During the benchmark, the processor simply sends the request and
+transfers the data, while
+during normal operation, the processor will send the request,
+disconnect, get interrupted, reconnect, and transfer the data.
+.LP
+This technique can be used to discover how many drives a system can support
+before the system becomes CPU-limited because it can produce the
+overhead load of a fully configured system with just a few disks.
+.NH 2
+Parallelism
+.LP
+description of parallelism benchmarks with sample results.
+.NH 2
+Other benchmarks
+.LP
+description of other benchmarks with sample results.
+.NH 1
+Scaling Benchmarks
+.LP
+There are a number of issues associated with converting
+single-process benchmarks to
+scalable benchmarks with several independent processes,
+in addition to the various issues addressed by
+the timing harness.
+Many of the benchmarks consume or utilize system +resources, such as memory or network bandwidth, +and a careful assessment of the likely resource +contention issues is necessary to ensure that the +benchmarks measure important aspects of system performance +and not artifacts of artificial resource contention. +.LP +For example, the Linux 2.2 kernel uses a single lock to +control access to the kernel data structures for a file. +This means that multiple processes accessing that file +will have their operations serialized by that lock. +.NH 2 +File System +.LP +A number of the benchmarks measure aspects of file system +performance, such as \*[bw_file_rd], \*[bw_mmap_rd], +\*[lat_mmap], and \*[lat_pagefault]. +It is not immediately apparent how these benchmarks should +be extended to the parallel domain. For example, it may +be important to know how file system performance scales +when multiple processes are reading the same file, or +when multiple processes are reading different files. +The first case might be important for large, distributed +scientific calculations, while the second might be more +important for a web server. +.LP +However, for the operating system, the two cases are +significantly different. When multiple processes +access the same file, access to the kernel data +structures for that file must be coordinated and +so contention and locking of those structures can +impact performance, while this is less true when +multiple processes access different files. +.LP +In addition, there are any number of issues associated +with ensuring that the benchmarks are either measuring +operating system overhead (e.g., that no I/O is actually +done to disk), or actually measuring the system's I/O +performance (e.g., that the data cannot be resident in +the buffer cache). 
Especially with file system related +benchmarks, it is very easy to develop benchmarks that +compare apples and oranges (e.g., the benchmark includes +the time to flush data to disk on one system, but only +includes the time to flush a portion of data to disk on +another system). +.LP +\*[lmbench3] allows the user to measure either case +as controlled by a command-line switch. When measuring +accesses to independent files, the benchmarks first +create their own private copies of the file, one for +each child process. Then each process accesses its +private file. When measuring accesses to a single +file, each child simply uses the designated file +directly. +.NH 2 +Context Switching +.LP +Measuring context switching accurately is a difficult +task. \*[lmbench1] and \*[lmbench2] measured context +switch times via a "hot-potato" approach using pipes +connected in a ring. However, this experimental +design heavily favors schedulers that do "hand-off" +scheduling, since at most one process is active at +a time. +Consequently, it is not really a good benchmark +for measuring scheduler overhead in multi-processor +machines. +.LP +The design and methodology for measuring context +switching and scheduler overhead need to be revisited +so that it can more accurately measure performance +for multi-processor machines. +.NH 1 +New Benchmarks +.LP +\*[lmbench3] also includes a number of +new benchmarks. +.NH 2 +Stream +.LP +\*[lmbench3] includes a new micro-benchmark which +measures the performance of John McCalpin's \*[stream] +benchmark kernels for \*[stream] versions 1 and 2. +This benchmark faithfully recreates each of the +kernel operations from both \*[stream] benchmarks, +and because of the powerful new timing harness it +can easily measure memory system scalability. +.TSTART 1 +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . 
+Stream +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +COPY|$a[i]=b[i]$|8(+8)|8|0 +SCALE|$a[i]=q times b[i]$|8(+8)|8|1 +ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 +TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2 +.TE +.TS +center box tab (|); +c s s s s +c | c | c s | c +l | l | l | l | l . +Stream2 +_ +Kernel|Code|Bytes|FL +||rd|wr|OPS +_ +FILL|$a[i]=q$|0(+8)|8|0 +COPY|$a[i]=b[i]$|8(+8)|8|0 +DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2 +SUM|$sum=sum + a[i]$|8|0|1 +.TE +.TEND "Stream operations" +.LP +Table \n[TABLE] shows the four kernels for each version +of the \*[stream] benchmark. Note that the +.I read +columns include numbers in parenthesis, which +represent the average number of bytes read into +the cache as a result of the write to that +variable. Cache lines are almost invariably +bigger than a single double, and so when a +write miss occurs the cache will read the line +from memory and then modify the selected bytes. +Sometimes vector instructions such as SSE +and 3DNow can avoid this load by writing an +entire cache line at once. +.NH 2 +Basic operation latency +.LP +\*[lmbench3] includes a new micro-benchmark +which measures the latency for a variety of basic +operations, such as addition, multiplication, and +division of integer, float, and double operands. +To measure the basic operation latency we construct +a basic arithmetic statement containing the desired +operands and operations. This statement is repeated +one hundred times and these repetitions are then +embedded in a loop. +.TSTART +.TS +center box tab (&); +c c c +l & l & l . +Operand&Operation&Statement +_ +int&$bit$&r^=i;s^=r;r|=s; +&$add$&a+=b;b-=a; +&$mul$&r=(r*i)^r; +&$div$&r=(r/i)^r; +&$mod$&r=(r%i)^r; +_ +float&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +_ +double&$add$&f+=f; +&$mul$&f*=f; +&$div$&f=g/f; +.TE +.TEND "lat_ops statements" +.LP +Table \n[TABLE] shows the data type and expressions +used for each basic operation type. 
The variable +$i$ indicates the integer loop variable and generally +changes every ten or hundred evaluations of the +basic expression. All other variables are of +the basic type being measured, and aside from +being modified by the relevant expressions are +only initialized once at the beginning of the +benchmark routine. +.LP +Each statement has been designed to ensure that +the statement instances are \fIinterlocked\fR, +namely that the processor cannot begin processing +the next instance of the statement until it has +completed processing the previous instance. This +property is crucial to the correct measurement of +operation latency. +.LP +One important consideration in the design of +the statements was that they not be optimized +out of the loop by intelligent compilers. +Since the statements are repeated one hundred +times, the compiler has the option of evaluating +the sequence of one hundred repetitions of the +same statement, and sometimes it can find +optimizations that are not immediately +apparent. For example, the integer statement +$a=a+a;$ when repeated one hundred times in +a loop can be replaced with the single statement +$a=0;$ because the statement $a=a+a;$ is equivalent +to $a< < =1;$, and one hundred repetitions of that +statement is equivalent to $a< < =100;$, which for +32bit (or even 64bit) integers is equivalent to +$a=0;$. +.LP +It is relatively easy to identify floating +point statements that interlock, are not +optimized away, and that only use the operation +of interest. +It is much harder to identify integer statements +meeting the same criterion. All simple +integer bitwise operations can either be optimized +away, don't interlock, or use operations other +than one of interest. +We chose to add operations other than the +operation(s) of interest to the statements. +.LP +The integer $mul$, $div$, and $mod$ statements all +include an added $xor$ operation which prevents +(current) compilers from optimizing the statements +away. 
Since the $xor$ operation is generally +completed in a single clock tick, and since +we can measure the $xor$ operation latency +separately and subtract that overhead, we can +still measure the latencies of the other +operations of interest. +.LP +It is not possible to measure latency for 64bit +operations on 32bit machines because most +implementations allow operations on the upper +and lower bits to overlap. This means that +on most 32bit machines, the measured latency +would appear to be a non-integral multiple of +the basic clock cycle. For example, in the +$add$ statement, the system could first add +the two lower words. Then, in parallel it +could both add the two upper words (along with +the carry from the lower words), and compute +the $xor$ of the lower word. Finally, it +can overlap the $xor$ of the upper word +with the addition of the two lower words from +the next instantiation of the statement. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . +Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +mhz&&2.50&1.50&0.75 +int&$bit$&2.53&1.50&0.75 +&$add$&2.50&1.51&0.75 +&$mul$&14.52&6.07&3.03 +&$div$&109.40&58.52&30.86 +&$mod$&75.14&65.01&32.59 +_ +float&$add$&7.54&4.58&3.0 +&$mul$&7.50&7.50&3.0 +&$div$&45.00&35.26&13.21 +_ +double&$add$&7.52&4.53&3.01 +&$mul$&7.52&7.71&3.01 +&$div$&85.01&35.51&13.16 +.TE +.TEND "lat_ops results (ns)" +.LP +Table \n[TABLE] contains some sample results +for two processors. +It does contain one result which is slightly +surprising unless you are familiar with the +PA-RISC architecture: floating point multiply +and divide are faster than the corresponding +integer operations! This is because PA-RISC +does not contain integer MUL, DIV, or MOD +instructions and the optimizing compiler +converts the integers into floating point, +does the operations in the floating point +unit, and then converts the result back +to an integer. 
+.NH 2 +Basic operation parallelism +.LP +Instruction-level parallelism in commodity processors +has become commonplace in the last ten years. +Modern processors typically have more than one +operational unit that can be active during a +given clock cycle, such as an integer arithmetic +unit and a floating point unit. In addition, +processors may have more than a single instance +of a given type of operational unit, both of +which may be active at a given time. All this +intra-processor parallelism is used to try and +reduce the average number of clock cycles per +executed instruction. +.LP +\*[lmbench3] incorporates a new benchmark \*[par_ops] +which attempts to quantify the level of available +instruction-level parallelism provided by the processor. This +benchmark is very similar to \*[lat_ops], and +in fact uses the same statement kernels, but it +has been modified and extended. We create +different versions of each benchmark; each +version has $N$ sets of interleaved statements. +Each set is identical to equivalent \*[lat_ops] +statements. In this way multiple independent +sets can be executing the same operation(s) +in parallel, if the hardware supports it. +.LP +For example, the float $mul$ benchmark to measure +performance with two parallel streams of statements +would look something like this: +.DS +#define TEN(a) a a a a a a a a a a +void benchmark_1(iter_t iterations, void* cookie) +{ + register iter_t i = iterations; + struct _state* state = (struct _state*)cookie; + register float f0 = state->float_data[0]; + register float f1 = state->float_data[1]; + + while (i-- > 0) { + TEN(f0*=f0; f1*=f1;) + } + use_int((int)f0); + use_int((int)f1); +} +.DE +.LP +If the processor had two floating point multiply +units, then both $f0$ and $f1$ multiplies could +proceed in parallel. +.LP +However, there are some potential problems with +the integer operations, namely the fact that the +statements contain mixed operations. 
In general, +processors have at least as many integer units +that can do $xor$ as can do the other operations +of interest ($mul$, $div$ and $mod$), so the +inclusion of $xor$ in the statements shouldn't +be a bottleneck. +.LP +However, since parallelism is measured by comparing +the latency of the single-stream with that of +multiple interleaved streams, and since the single-stream +latency includes the $xor$ latency, the apparent +parallelism of $mul$, $div$, $mod$ can be over-stated. +For example, if a process has one unit that can +do integer bit operations, such as $xor$, and another +unit for integer $mul$ operations, then the average +latency for $a0 = (i * a0) ^ a0$ in the single stream +case would be: +.EQ +t bar = t sub xor + t sub mul +.EN +In the multi-stream case, the execution of the $xor$ +operation of one stream can be overlapped with the +$mul$ of another stream, so the average latency per +stream would simply be $t bar = t sub mul$, assuming +that $mul$ operations are not cheaper than $xor$ +operations, which results in an apparent parallelism +$p tilde$: +.EQ +p tilde = {t sub xor + t sub mul} over { t sub mul } +.EN +Assuming that $t sub xor < < t sub mul$, this +still gives a reasonable approximation to +the correct answer. Unfortunately, this is +not always a reasonable assumption. +.LP +Of course, if it was known ahead of time that +$xor$ and { $mul$, $div$, and $mod$ } used +different execution units, then the benchmark +could simply subtract $t sub xor$ from the +baseline measurement. The difficulty lies +in determining whether the units overlap +or not. +.TSTART +.TS +center box tab (&); +c c c c c +c c c c c +l & l & r & r & r . 
+Operand&Op&HPPA2.0&PIII&AMD +&&400MHz&667MHz&1.3GHz +_ +int&$bit$&1.99&1.70&1.87 +&$add$&1.99&1.61&1.90 +&$mul$&6.64&3.81&2.00 +&$div$&2.81&1.20&1.00 +&$mod$&2.78&1.11&1.03 +_ +float&$add$&5.88&1.00&2.66 +&$mul$&5.86&1.14&2.47 +&$div$&2.12&1.03&1.14 +_ +double&$add$&5.68&1.08&2.49 +&$mul$&5.58&1.00&2.53 +&$div$&2.19&1.03&1.14 +.TE +.TEND "par_ops results" +.LP +.NH 1 +Results +.LP +Some sample results +.LP +bw_mem_rd performance vs. scaling on an SMP machine +.LP + +.NH 1 +Unscalable benchmarks +.LP +There are a number of benchmarks which either +did not make sense for scalable load, such as +\*[mhz], or which could not +be extended to measure scalable load due to +other constraints, such as \*[lat_connect]. +.LP +\*[mhz] measures the processor clock speed, +which is not a scalable feature of the system, +so it doesn't make any sense to create a +version of it that measures scalable performance. +.LP +More specifically, \*[lat_connect] measures +the latency of connecting to a TCP socket. +TCP implementations have a timeout on +sockets and there is generally a fixed size +queue for sockets in the TIMEOUT state. +This means that once the queue has been +filled by a program connecting and closing +sockets as fast as possible, then all new +socket connections have to wait TIMEOUT +seconds. Needless to say, this gives no +insight into the latency of socket creation +per se, but is rather a boring artifact. +Since the \*[lmbench2] version of the +benchmark can run for very short periods +of time, it generally does not run into +this problem and is able to correctly +measure TCP connection latency. +.LP +Any scalable version of the benchmark needs +each copy to run for at least a second, and +there are $N$ copies creating connections as +fast as possible, so it would essentially be +guaranteed to run into the TIMEOUT problem. +Consequently, \*[lat_connect] was not +enhanced to measure scalable performance. 
+.NH 1 +A brief tutorial on memory design +.LP +Nearly all modern, general purpose computers use +virtual memory with physically addressed caches. +As such, there is typically one or more caches +between the physical memory and the processor, +and virtual-to-physical address translation +occurs between the processor and the top-level +cache. Cache staging and replacement is done +in \fIcache line\fR units, which are typically +several words in length, and caches lower in +the hierarchy sometimes have cache lines which +are larger than those in the higher caches. +.LP +Modern processors usually incorporate at least +an L1 cache on-chip, and some are starting to +also incorporate the L2 cache on-chip. In +addition, most include a translation look-aside +buffer (TLB) on-chip for fast virtual-to-physical +address translation. +.LP +One key element of any cache design is its +replacement strategy. Most caches use either +direct-mapped or set associative caches. In +the first instance any word in physical memory +has exactly one cache line into which it +may be staged, while set associative caches +allow a given word to be cached into one of a +set of lines. Direct-mapped caches have a +very simple replacement policy: the contents +of the line that is needed is discarded. +Set associative caches usually use LRU or +some variant within each set, so the least +recently used line in the set of possible +cache lines is replaced. The control logic +for direct-mapped caches is much cheaper to +build, but they are generally only as +effective as a set-associative cache half +the size.\** +.FS +See +.RN Hennessy96 +page 396. +.FE +.LP +Another key element of memory hierarchy design +is the management of dirty data; at what point +are writes passed down the memory hierarchy to +lower caches and main memory? The two basic +policies are write-through and write-back. 
+A write-through policy means that writes are +immediately passed through the cache to the +next level in the hierarchy, so the lower +levels are updated at the same time as the +cache. A write-back policy means that the +cache line is marked as dirty in the cache, +and only when the line is ejected from the +cache is the data passed down the hierarchy. +Write-through policies are often used in +higher (smaller) caches because multi- +processor systems need to keep a coherent +view of memory and the writes are often +propagated to other processors by \fIsnoopy\fR +caches. +.LP +One often overlooked aspect of cache +performance is cache behavior during +writes. Most cache lines contain +several words, and most instructions +only update the line a word at a time. +This means that when the processor +writes a word to a cache line that is +not present, the cache will read the +line from memory before completing the +write operation. For \*[bcopy]-like +operations this means that the overall +memory bandwidth requirement is actually +two reads and one write per copied word, +rather than the expected read and write. +.LP +Most modern processors now include some form +of prefetch in the memory hierarchy. For +the most part these are simple systems that +can recognize fixed strided accesses through +memory, such as might be seen in many array +operations. However, prefetching systems +appear to be growing in complexity and +capability. +.LP +Additionally, modern memory subsystems can +usually support multiple outstanding requests; +the level of parallelism is usually dependent +on the level of the hierarchy being accessed. +Top-level caches can sometimes support as +many as six or eight outstanding requests, +while main memory can usually support two +outstanding requests. 
Other elements of +the memory hierarchy, such as the TLB, often +have additional limits on the level of +achievable parallelism in practice.\** +.FS +For example, if the TLB serializes all +TLB misses, and if each memory access +causes a TLB miss, then the memory +accesses will be serialized even if +the data was in a cache supporting +six outstanding requests. +.FE +.LP +For more information and details on memory +subsystem design, and computer architecture +in general, please see +.RN Hennessy96 +which has an excellent description of these +and many other issues. +.NH 1 +Memory analysis +.LP +There are a variety of aspects of memory hierarchy design +that are interesting to a software developer, such as +the number of caches and their sizes. In addition, other +aspects of cache design, such as the line size, +associativity and parallelism can impact software +performance and are of potential interest to software +developers. +.LP +The problem is designing a portable ANSI-C program to +infer the cache parameters. A number of operating +systems have hooks to report at least certain aspects +of cache and memory hierarchy design, but any program +utilizing those hooks would not be fully portable +across hardware and operating system platforms. +.LP +The key observation is that caches help reduce memory +latency. In a perfect world, all possible data would +fit in the cache, so a graph of average memory latency +versus amount of memory utilized would look like a +series of plateaus separated by cliffs. The cliff +edges would be located at the cache boundaries and +the plateau height would be the average memory latency. +.LP +The first problem is that one needs a mechanism for +accurately measuring time in a portable fashion. +\*[lmbench2] introduced a new timing harness +that determines the minimum duration of a timing interval +for \*[gettimeofday] to provide accurate measurements +.RN Staelin98 . 
+.LP +\*[lmbench] includes a benchmark that measures +average memory latency, \*[lat_mem_rd] +.RN McVoy96 . +It creates a pointer chain, and then measures the +average time to dereference the pointers. +\*[lat_mem_rd] creates the pointer chain by simply +striding through memory at fixed intervals, e.g. +every other word. +.LP +\*[lmbench2] extended \*[lat_mem_rd] so +that each timing interval only accessed memory +as many times as necessary to consume a timing +interval. When accessing cache this often means +that the whole pointer chain will be accessed +at least once during the timing interval, but +when accessing memory this often means that only +a portion of the chain will be accessed during +any given timing interval. +.LP +While this approach gives very useful insights +into memory hierarchy performance, it is not +quite sufficient to determine the various +characteristics of the memory hierarchy. +.LP +The first problem is that unless the stride is +exactly the same size as the cache size, then +there will either be multiple successive accesses +to the same line, or some fraction of data +will be completely skipped. In the first case +the observed latency is much faster than the +true latency because it is the average of a +single miss latency (slow) with one or more +hit latencies (fast). In the second case, the +amount of data actually loaded into the cache +may be a small fraction of the expected amount +so the data may fit into a smaller (faster) +cache. +The second problem is that this sequence is +highly predictable, even by simple-minded +prefetching policies, so accurate prefetching +might be masking the true memory latencies. +.LP +This method does do a few things properly. +First of all, accesses to a single page are +clustered together so the TLB miss cost (if +any) is amortized over as many accesses as +possible. 
Secondly, assuming the pointer +chain is laid out unpredictably, the memory +subsystem must wait for the previous load +to complete before it can initiate the +next load, so we can measure the true latency. +.NH 2 +Prefetching +.LP +Some memory subsystems have been highly optimized to +recognize and automatically prefetch memory when +given "predictable" memory access streams, such as +when striding through array accesses. This means that +the memory access stream generated by \*[lmbench] +must be unpredictable by the standard prediction +algorithms. +.LP +The original \*[lmbench] memory latency benchmark, +lat_mem_rd, built a chain of pointers that would +stride backwards through memory. This was able to +defeat many simple prefetching algorithms of the +time, but some systems came to incorporate prefetching +algorithms that recognized strided accesses in +both directions. +.LP +The obvious method for producing an unpredictable +chain of line references is to use a random +permutation of line indexes. +.LP +\*[lmbench] uses a deterministic algorithm to compute +the reference chain which guarantees that references +are as far away from previous accesses in both time +and space as possible. Basically, the binary bits +representing the line index are reversed, so that +1101 becomes 1011, or 001 becomes 100. This only +works if the number of cache lines is an even power +of two, but since page sizes and line sizes are +always powers of two, this assumption is valid.\** +.FS +At least this is the case in every modern system known +to the author. +.FE +.LP +Additionally, since higher-level caches can have +smaller line sizes than lower-level caches, it +is necessary to access every word in the relevant +chunk of memory. However, accesses to words in +the same line must be separated in time by accesses +to the rest of the memory. 
This is achieved by +identifying the line size for the largest cache, +and then setting up the chain so that there is +one pass through the memory for each word in the +line with the sequence of words being determined +by the bit-reversal method described above. +.LP +For example, suppose a system has 4KB pages, the +largest cache has a line size of 64bytes, and a +word is 4bytes. Then each page would have 64 lines, +and each line would have 16 words. The system +would set up a pointer chain that visits each line +on each page using the zeroth word; at the end of +the chain it would then jump to the start of the +pages and visit each line on each page using the +eighth word, and so forth until each word had been +visited. +.NH 2 +Dirty data +.LP +An additional issue that we need to take into +account is the cache's policy for dirty data. +Many caches use a copy-back policy, while others +use a write-through policy. +.LP +Different caches on the same machine may use +different policies. Also, cache performance +can be affected by the presence of dirty data. +For example, suppose both the L1 and L2 caches +use a copy-back policy, and suppose that the +access time for reading data located in L2 +depends on whether the data being ejected from +L1 is dirty and needs to be copied back from L1 +to L2 before the read from L2 to L1. +In this case, a benchmark which writes a pointer +chain that fits in L2 but is larger than L1, +and then measures the time to follow the chain, +will get a different average memory latency than +a benchmark which writes the same chain and +reads enough data to flush the L2 cache before +measuring the time to follow the chain. +In the first case, each application read will +result in a write from L1 to L2 followed by +a read from L2 to L1, while in the second +case each application read will only result +in a read from L2 to L1. 
+.LP +Since it is possible that average memory latencies +for a read-only access stream may be increased if +any of the data in the cache is dirty, we need to +flush the cache after setting up the pointer +chains and before we do any measurements. +Otherwise, when we access a pointer chain that +is larger than the L1 cache but smaller than the +largest cache, dirty data can reside in the lowest +(largest) cache and as each line is staged from +the largest cache to the L1 cache, it is marked +as dirty in the L1 cache. Then when each dirty +line is flushed from the L1 cache (to the L2 +cache), the system has to write the data back to +L2, which delays the load of the next (dirty) +line from L2 to L1. +.LP +To flush the cache we read (and sum) a large +amount of memory, which should be several times +larger than the largest cache. In this way, +all dirty data in the cache should be flushed +from the cache without creating additional +dirty data. +.NH 2 +Page mapping +.LP +Complicating the issue still further is the fact that +caches do not use full LRU replacement policies. Nearly +all caches use some form of set associativity, where +pages are directed to a pool of cache lines based on +the physical address. Replacement within the pool is +typically LRU. Direct-mapped caches are a special case +where the pool size is a single line. +.LP +Additionally, some systems use victim caches, which are +typically small caches which cache recently discarded +cache lines. Victim caches can be particularly effective +for direct-mapped caches by reducing the cache miss +rate caused by colliding hot spots. +.LP +However, page mapping and its attendant cache collisions +is under the control of the kernel, and is in fact +invisible to user-land programs. 
Some operating +systems make an effort to minimize possible page collisions +when giving memory to processes\**, while other operating +systems appear to simply grab the first available pages, +regardless of potential cache collision effects. +.FS +This is generally known as "page coloring", and is much +more important on systems with direct-mapped caches than +those with N-way set associative caches. +.FE +.LP +Factoring out page placement effects on average memory +latency is very difficult, but it is necessary to +ensure that the correct cache size is identified. +.NH 1 +Cache line size +.LP +The first feature of the memory hierarchy we +will try to analyze is the cache line size, +since we can find the line size for the +largest cache without any other knowledge of +the system, and since determining nearly all +other aspects of the memory subsystem either +requires or is greatly simplified by knowing +the cache line size. +.LP +The most obvious aspect of cache design is that replacement +is done on a per-line basis, and cache lines often contain +several words of data (32-128bytes per line is common). +However, it is necessary to ensure that we don't +generate "spurious" cache hits by referencing a word from +a cache line that was recently accessed. We must ensure +that each line is only re-referenced after all other +memory in the buffer has been referenced. +.LP +Unfortunately, we usually do not know the cache line size +ahead of time. In addition, sometimes systems contain +several caches, and each cache can use a different line +size! Usually line sizes are powers of two, and usually +the smaller (higher) caches have line sizes which are the +same or smaller than the larger (lower) caches. However, +we still need to ensure that we access all cache lines +for all caches without generating the spurious cache hits. +.LP +Determining the cache line size requires a series of +experiments. 
The basic observation is that when the +amount of memory being accessed is larger than the +cache, and when the access chain is arranged properly, +then each memory reference causes a cache miss. If, +however, a word on a recently accessed line is requested, +then that reference will be a cache hit. More +completely, the average memory access time $t bar$ +is: +.EQ +t bar = t sub miss + ( n - 1 ) t sub hit +.EN +expressed as a function of $n$, the number of accesses +to the cache line, $t sub miss$, the cache miss latency, +and $t sub hit$, the cache hit latency. +.TSTART +.G1 +.so memhier-line.d +.G2 +.FEND "Line Size" +.LP +We can determine the cache line size by measuring +the average memory access latency over a series of +memory access patterns: accessing every word, every +other word, every fourth word, every eighth word, ... +While the system is accessing multiple words per +cache line, the average memory latency will be +smaller than the cache miss latency, and as the +space between accesses increases, the average +memory latency will grow. +When the system accesses only one word per line, +the average memory latency will remain level even +as the spacing between accesses increases. +.LP +It is possible to utilize this behavior to identify +the cache line size. The algorithm is to measure +the average memory latency when each word is +accessed. Then as you increase the space between +accessed words (doubling the space each iteration), +you look for a situation where the average latency +increased dramatically, say greater than 30%, +followed by a levelling off on the next iteration, +say an increase less than 15%. The line size is +the last point where the average latency jumped +dramatically. +.NH 1 +TLB +.LP +Measuring the TLB-miss costs assumes that one can isolate +those costs from the rest of the memory access costs. 
The +key observation is that it is often possible to create a +situation in which all data being accessed resides in the +cache, and yet it requires a TLB-miss to be able to locate +it. +.LP +This program identifies the effective TLB size, rather +than the true TLB size. First of all, from a programmer's +point of view, it is really the effective TLB size that +impacts program performance. Secondly, there is no way +for a user-land program to measure true TLB size because +kernels sometimes pin some kernel page mappings into the +TLB and because some hardware/OS combinations +support "super-pages", or multi-page mappings. +.LP +We create two similar pointer chains with identical length +and which reference an identical amount of memory, with one +key difference. In the first chain, the data is packed +tightly into as few pages as possible, and references +remain within a single page as long as possible. The +second chain spreads the data over as many pages as +possible and jumps between pages at each reference. +The two chains are arranged so that the same amount of +data will fit into the cache, so that the raw memory +access time for each chain is identical, within +experimental constraints. The sole difference between +average access costs should be the TLB-lookup times. +.LP +When the pages from the second chain fit into the TLB, +the average access times for the two chains should be +identical. However, as soon as the number of pages in +the second chain exceeds the TLB size, the second +chain will start to pay TLB-miss costs. Depending on +the TLB replacement policy, the fraction of requests +generating TLB-misses in the second chain can vary +dramatically\**. +.FS +Pure LRU would ensure that as soon as the chain was one +page longer than the TLB size, every access would trigger +a TLB-miss. However, other replacement algorithms might +result in as few as $"number of pages" - "TLB size" + 1$ +misses per iteration over the loop. 
+.FE +.TSTART +.G1 +.so memhier-tlb.d +.G2 +.FEND "TLB" +.LP +The system must search for the point at which the +average memory latency of the second chain diverges +from the average latency of the first chain. Since +most systems have relatively small TLBs and since +checking TLB sizes smaller than the effective TLB +size is faster than checking TLB sizes larger than +the TLB, the system starts with the guess of eight +pages to establish a baseline. It then iteratively +doubles the number of pages until either a maximum +limit has been reached or the average TLB-miss cost +is greater than 15% of the average memory latency. +Once it discovers the upper bound on the possible +TLB size, it uses a binary search between the last +two TLB size guesses to find the point at which +the average latency for the two streams diverge. +.NH 1 +Cache size +.LP +For the purpose of identifying the cache size, the +ideal situation is that as long as the amount of +memory is equal to or less than the cache size, then +all the data is in the cache and the average memory +latency is the cache hit latency. As soon as the +memory doesn't fit in cache, then none of it should +be in the cache, so the average memory latency is +the cache miss latency.\** When examining average +memory latency versus memory size, this would give +nice flat plateaus for each cache, with nice sharp +transitions from one cache to the next, and from the +largest cache to main memory. +.FS +Of course, for real programs, you want the average +memory latency to be as low as possible, which means +that you want as much of the data in cache as possible. +.FE +.LP +However, the realities are that real data from real +systems is corrupted in a variety of ways. +First of all, even when the memory can fit into the +cache, pages often collide in the cache and the +fraction of pages that have collisions often +increases as the amount of memory nears the cache size. 
+Secondly, even when the memory cannot fit into the +cache, there can be pages that do not collide. +Finally, there is simple experimental noise, which is +usually limited to 1% or less. +.LP +The result of the first two problems is that on +some systems, the average memory latency increases +gradually as the memory size is increased. There +are no flat plateaus and sharp cliffs which make +it easy to identify the number, size, and +performance of the caches. +.NH 2 +Page coloring +.LP +The first problem is to create a set of pages +which do not collide in the cache. +The solution is to allocate more memory +than necessary, and to try different combinations +of pages to find the page set with the fastest +average memory latency. Unfortunately, the obvious +algorithm is exponential in the number of pages. +.TSTART +.G1 +.so memhier-color.d +.G2 +.FEND "Page Coloring Effects" +.LP +One observation is that cache misses are usually +much more expensive than cache hits. So, one +possibility is to choose a random set of pages +as the baseline and measure the average memory +latency. Then iterate over the pages, removing +that page from the set and measuring the average +memory latency of the reduced set. If that page +collides with another page, then the average +memory latency for the reduced set should be smaller +than the average latency for the whole set. +.LP +Once a page that collides has been identified, then +the system can iterate through available pages, +try adding them to the reduced set and measuring +the average memory latency. If the page doesn't +collide with any pages in the reduced set, then +the average memory latency should drop still further. +In this way, the system could identify all +colliding pages and replace them with pages +that don't collide (assuming the memory all +fits in the cache). +.LP +There are a number of problems with this simple approach. 
+First of all, it would take a very long time to run due +to the large, but polynomial, number of experiments required. +Secondly, as the memory size increases and the +number of pages involved gets large, the effect +of a single page on the average memory latency +can reach the level of experimental noise. +.LP +This approach makes the assumption that physical +page locations do not change once the memory +has been allocated. In most systems, this +assumption is valid unless the memory is paged +to disk. However, at least IRIX includes an +operating system configuration option to allow +the operating system to dynamically relocate +pages in memory. This capability is disabled +by default, so its use is relatively uncommon. +It is possible that page relocation will become +more common in the future, in which case this +design may need to be revisited in the future. +.LP +Our algorithm uses this basic approach, but +attempts to reduce the number of experiments +required by removing chunks of pages at a time. +It will remove up to 5% of pages at a time +and see if the average memory latency decreases +significantly, in which case it examines the +chunk a page at a time to find the page or +pages which probably conflict. +.LP +An additional problem is that for large caches, +the measured difference between two sets of +pages with just one page collision difference +can be very hard to measure. For example, +on a system with a 512Kbyte L2 cache and 4Kbyte +pages, the cache can hold 128 pages. Assuming +that a cache miss is 200ns, a cache hit is 50ns, +and 123 pages have no collisions but 5 pages +collide, then the average memory latency is +.EQ +t bar = { 123 times 50 + 5 times 200 } over 128 +.EN +or 55.85ns. Suppose we remove one page and +replace it with another page which doesn't +collide, so we now have 4 collisions and +124 pages without collisions, then the +average memory latency is 54.68ns. 
The +difference is generally significant even +in the face of experimental noise, but for +larger caches the differences may recede +into the background noise. +.LP +As caches increase in size, the problems +associated with detecting page collisions +can only increase. +For example, an 8MB cache on a system with +4KB pages would contain 2,048 pages. +Removing a single page collision, even when +the resulting memory latency for that page +reduces by a factor of four, would simply +result in an overall reduction in average +memory latency of less than 0.2%, which is +smaller than the average experimental measurement +errors. +.LP +Additionally, as caches increase in size, +effects such as cache consumption by the +page table can begin to become important. +.LP +The single largest remaining problem in our +system is that this algorithm does not +guarantee that we find a set of pages +which do not contain any collisions in all +cases that it \fImight\fR find such a set. +It merely does so \fImost\fR of the time +with (relatively) few measurements. +.LP +One possible means of dealing with this +problem is to try and remove sets of pages +in the hope that enough pages from a set +of colliding pages will be removed at +once, so that the remaining pages from +that collision set won't collide anymore. +Suppose you have a 4-way set associative +cache, and that you have six pages that +collide. If you remove two of the pages, +then the remaining four pages don't collide +anymore either. This means that by +removing two pages we have removed six +collisions, which should be easier to +detect. +.LP +XXX Look into randomizing the pages +after each iteration of the top-level +loop to make this sort of serendipitous +event more likely. +.NH 2 +Measurement +.LP +In order to reduce the number of memory sizes +that are measured by the system, we use a +binary search on memory sizes to find "edges" +in the memory latency. 
+We make the simplifying assumption that cache +sizes are either a power of two, or 1.5 times +a power of two. In our experience, this assumption +has been true. +We also assume that no cache is smaller than +512 bytes. +.LP +We explore the memory space at intervals +equivalent to the most recent power of two +divided by four. So, starting at one +megabyte we would (potentially) measure +memory latency at 1MB, 1.25MB, 1.5MB, and +1.75MB. This allows us to detect +cache sizes at the desired intervals, since +the measurement at the exact cache size +can often be corrupted by other system +activity so the next smaller measurement +should still be valid. +.LP +XXX If the measurement size increment is +several times larger than a page, then +perhaps we should actually measure the +system with a couple pages less than the +stated size? +This would allow us some "slop" for +collisions and might make it easier near +cache boundaries to get accurate +measurements. +The "slop" should probably be some fraction +of the measurement increment size, such as +10%, so it scales properly. +.LP +Since we start with a maximum size as a given, +and we use 512 bytes as a minimum, and we can +compute the full set of possible measurements, +and initialize an array with the desired sizes. +We can then use a modified binary search on +this array to efficiently locate cache edges +while still (potentially) leaving large, flat +plateaus unexplored between the end points. +.LP +Finally, we assume that true memory latency +is monotonically increasing with the amount +of memory that you access. +This means that if the measured latency ever +decreases as you increase the amount of +accessed memory, then the previous measurement +must have been an error and the value is +replaced by the smaller measurement. 
+.NH 2 +Data analysis +.LP +Assuming the data collected by the system +were noise-free and that the experimental +system had managed to eliminate all artifacts +such as page coloring effects, then the +next problem is to analyze the data to find +the number and size of the caches. +Basically this means examining the data to +find plateaus and cliffs. +Each plateau would represent a cache, and the +cliff represents the edge (size) of the cache. +.LP +Of course, real data is never perfect, and +there are any number of issues which can +affect the experimental results, so the +analysis methodology must be robust to noise. +.LP +XXX describe analysis methodology here +.NH 1 +Cache associativity +.LP +No modern caches are fully associative, meaning that +no caches use LRU replacement, because the performance +of such caches is insufficient. Most caches are +either set associative or direct mapped, meaning +that data from a given location can only go to +one of a small number of cache lines, and in the +case of a direct-mapped cache to a single cache line. +.LP +To determine the cache associativity we need to find +a set of pages which have no page collisions and +which (just) fit into the cache. We then need to +locate a page which collides with these pages and +append it to the set. +Then we can iterate through the pages in the initial +page set, removing a page at a time, and comparing +the resulting average memory latency with that of +the full set. +When the average memory latency drops significantly, +then we know that this page conflicts with the +full page set, and since the page set only has one +conflict, we know it conflicts with the newly +introduced page. +The number of pages that conflict with this newly +introduced page is the set associativity. +.LP +There is a potential bug in this algorithm +for systems with victim caches! 
+If the victim cache can hold at least a page +of data, then this algorithm cannot properly +determine the cache associativity because the +victim cache will play the role of additional +associative cache lines. +.LP +For smaller caches there is the additional +problem that the cache associativity may not +be smaller than the number of pages that the +cache may hold. +In which case, this simple approach will +never find pages that collide in the cache. +The solution to this problem is to increase +the line size and the number of pages so that +only portions of each page are accessed, and +there can be enough pages to create collisions. +.NH 1 +Memory parallelism +.LP +With the increasing memory bottleneck, most modern +systems allow multiple outstanding memory references. +On many systems, the effective parallelism depends +on which part of the memory hierarchy is being +accessed. For example, L1 caches can often service +as many as six or eight outstanding requests, while main +memory systems can usually support at most two +outstanding requests. +.LP +To measure the available parallelism for a given +chunk of memory, the system sets up a pointer +chain running through the memory exactly the same +as if it were to measure the average memory +latency. It then uses fifteen different access +routines, one for each possible level of parallelism.\** +.FS +The assumption here is that no memory subsystem +supports more than sixteen accesses in parallel. +.FE +Each routine dereferences $N$ pointers in parallel. +For example, the inner loop of the routine where +$N=2$ would look something like this: +.DS +while (iterations-- > 0) { + p0 = (char**)*p0; + p1 = (char**)*p1; +} +.DE +.LP +The available parallelism is the maximum speedup +over all N compared to the sequential case. +.LP +Note that this value is often not integral because +many factors go into the effective parallelism, +such as TLB contention, can limit the effective +parallelism. 
+.NH 1 +Conclusion +.LP +\*[lmbench] is a useful, portable micro-benchmark +suite designed to measure important aspects of +system performance. +\*[lmbench3] adds a number of important extensions, +such as the ability to measure system scalability. +.NH 1 +Acknowledgments +.LP +Many people have provided invaluable help and insight into both the +benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers +were especially helpful. +We thank all of them +and especially thank: +Wayne Scott \s-1(BitMover)\s0, +Larry McVoy \s-1(BitMover)\s0, +and +Bruce Chapman \s-1(SUN)\s0. +.LP +We would also like to thank all of the people that have run the +benchmark and contributed their results; none of this would have been possible +without their assistance. +.LP +Our thanks to +all of the free software community for tools that were used during this +project. +\*[lmbench] is currently developed on Linux, a copylefted Unix written by +Linus Torvalds and his band of happy hackers. +This paper and all of the +\*[lmbench] documentation was produced using +the \f(CWgroff\fP suite of tools written by James Clark. +Finally, all of the data processing of the results is done with +\f(CWperl\fP written by Larry Wall. +.NH 1 +Obtaining the benchmarks +.LP +The benchmarks are available at +.ft I +http://ftp.bitmover.com/lmbench +.ft +.\" .R1 +.\" bibliography references-lmbench3 +.\" .R2 +.\"******************************************************************** +.\" Redefine the IP paragraph format so it won't insert a useless line +.\" break when the paragraph tag is longer than the indent distance +.\" +.de @IP +.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) +.par*start \\n[\\n[.ev]:ai] 0 +.if !'\\$1'' \{\ +. \" Divert the label so as to freeze any spaces. +. di par*label +. in 0 +. nf +\&\\$1 +. di +. in +. fi +. chop par*label +. ti -\\n[\\n[.ev]:ai]u +. ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c +. el \{\ +\\*[par*label] +.\". br +. \} +. rm par*label +.\} +.. 
+.\"******************************************************************** +.\" redefine the way the reference tag is printed so it is enclosed in +.\" square brackets +.\" +.de ref*end-print +.ie d [F .IP "[\\*([F]" 2 +.el .XP +\\*[ref*string] +.. +.\"******************************************************************** +.\" Get journal number entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-N +.ref*field N "" ( ) +.. +.\"******************************************************************** +.\" Get journal volume entries right. Now will print as V(N) rather +.\" than the awful V, N. +.\" +.de ref*add-V +.ref*field V , "" "" "" +.. +.\"******************************************************************** +.\" Get the date entry right. Should not be enclosed in parentheses. +.\" +.de ref*add-D +.ref*field D "," +.. +.R1 +accumulate +sort A+DT +database references-userguide +label-in-text +label A.nD.y-2 +bracket-label [ ] ", " +bibliography references-userguide +.R2 +.\" .so bios diff --git a/performance/lmbench3/hbench-REBUTTAL b/performance/lmbench3/hbench-REBUTTAL new file mode 100644 index 0000000..b5788a7 --- /dev/null +++ b/performance/lmbench3/hbench-REBUTTAL @@ -0,0 +1,245 @@ +In June of 1997, Margo Seltzer and Aaron Brown published a paper in +Sigmetrics called "Operating System Benchmarking in the Wake of Lmbench: +A Case Study of the Performance of NetBSD on the Intel x86 Architecture". + + +This papers claims to have found flaws in the original lmbench work. +With the exception of one bug, which we have of course fixed, we find +the claims inaccurate, misleading, and petty. We don't understand +what appears to be a pointless attack on something that has obviously +helped many researchers and industry people alike. lmbench was warmly +received and is widely used and referenced. We stand firmly behind the +work and results of the original benchmark. We continue to improve and +extend the benchmark. 
Our focus continues to be on providing a useful, +accurate, portable benchmark suite that is widely used. As always, we +welcome constructive feedback. + + +To ease the concerns of gentle benchmarkers around the world, we have +spent at least 4 weeks reverifying the results. We modified lmbench to +eliminate any effects of + + . clock resolution + . loop overhead + . timing interface overhead + +Our prediction was that this would not make any difference and our +prediction was correct. All of the results reported in lmbench 1.x are +valid except the file reread benchmark which may be 20% optimistic on +some platforms. + +We've spent a great deal of time and energy, for free, at the expense +of our full time jobs, to address the issues raised by hbench. We feel +that we were needlessly forced into a lose/lose situation of arguing +with a fellow researcher. We intend no disrespect towards their work, +but did not feel that it was appropriate for what we see as incorrect +and misleading claims to go unanswered. + +We wish to move on to the more interesting and fruitful work of extending +lmbench in substantial ways. + +Larry McVoy & Carl Staelin, June 1997 + +-------------------------------------------------------------------------- + +Detailed responses to their claims: + +Claim 1: + + "it did not have the statistical rigor and self-consistency + needed for detailed architectural studies" + +Reply: + + This is an unsubstantiated claim. There are no numbers which back + up this claim. + +Claim 2: + + "with a reasonable compiler, the test designed to read and touch + data from the file system buffer cache never actually touched + the data" + +Reply: + + Yes, this was a bug in lmbench 1.0. It has been fixed. + On platforms such as a 120 Mhz Pentium, we see a change of 20% + in the results, i.e., without the bug fix it is about 20% faster. + +Claim 3: + + This is a multi part claim: + + a) gettimeofday() is too coarse. 
+ +Reply: + + The implication is that there are number of benchmarks in + lmbench that finish in less time than the clock resolution + with correspondingly incorrect results. There is exactly one + benchmark, TCP connection latency, where this is true and that + is by design, not by mistake. All other tests run long enough + to overcome 10ms clocks (most modern clocks are microsecond + resolution). + + Seltzer/Brown point out that lmbench 1.x couldn't accurately + measure the L1/L2 cache bandwidths. lmbench 1.x didn't attempt + to report L1/L2 cache bandwidths so it would seem a little + unreasonable to imply inaccuracy in something the benchmark + didn't measure. It's not hard to get this right by the way, we + do so handily in lmbench 2.0. + + + b) TCP connection latency is reported as 0 on the DEC Alpha. + +Reply: + + We could have easily run the TCP latency connection benchmark in + a loop long enough to overcome the clock resolution. We were, + and are, well aware of the problem on DEC Alpha boxes. We run + only a few interations of this benchmark because the benchmark + causes a large number of sockets to get stuck in TIME_WAIT, + part of the TCP shutdown protocol. Almost all protocol stacks + degrade somewhat in performance when there are large numbers of + old sockets in their queues. We felt that showing the degraded + performance was not representative of what users would see. + So we run only for a small number (about 1000) interations and + report the result. We would not consider changing the benchmark + the correct answer - DEC needs to fix their clocks if they wish + to see accurate results for this test. + + We would welcome a portable solution to this problem. Reading + hardware specific cycle counters is not portable. + +Claim 4: + + "lmbench [..] was inconsistent in its statistical treatment of + the data" + ... 
+ "The most-used statistical policy in lmbench is to take the + minimum of a few repetitions of the measurement" + +Reply: + + Both of these claims are false, as can be seen by a quick inspection + of the code. The most commonly used timing method (16/19 tests + use this) is + + start_timing + do the test N times + stop_timing + report results in terms of duration / N + + In fact, the /only/ case where a minimum is used is in the + context switch test. + + The claim goes on to try and say that taking the minimum causes + incorrect results in the case of the context switch test. + Another unsupportable claim, one that shows a clear lack of + understanding of the context switch test. The real issue is cache + conflicts due to page placement in the cache. Page placement is + something not under our control, it is under the control of the + operating system. We did not, and do not, subscribe to the theory + that one should use better ``statistical methods'' to eliminate + the variance in the context switch benchmark. The variance is + what actually happened and happens to real applications. + + The authors also claim "if the virtually-contiguous pages of + the buffer are randomly assigned to physical addresses, as they + are in many systems, ... then there is a good probability that + pages of the buffer will conflict in the cache". + + We agree with the second part but heartily disagree with + the first. It's true that NetBSD doesn't solve this problem. + It doesn't follow that others don't. Any vendor supplied + operating system that didn't do this on a direct mapped L2 + cache would suffer dramatically compared to it's competition. + We know for a fact that Solaris, IRIX, and HPUX do this. + + A final claim is that they produced a modified version of the + context switch benchmark that does not have the variance of + the lmbench version. We could not support this. We ran that + benchmark on an SGI MP and saw the same variance as the original + benchmark. 
+ +Claim 5: + + "The lmbench bandwidth tests use inconsistent methods of accessing + memory, making it hard to directly compare the results of, say + memory read bandwidth with memory write bandwidth, or file reread + bandwidth with memory copy bandwidth" + ... + "On the Alpha processor, memory read bandwidth via array indexing + is 26% faster than via pointer indirection; the Pentium Pro is + 67% faster when reading with array indexing, and an unpipelined + i386 is about 10% slower when writing with pointer indirection" + +Reply: + In reading that, it would appear that they are suggesting that + their numbers are up to 67% different than the lmbench numbers. + We can only assume that this was delibrately misleading. + Our results are identical to theirs. How can this be? + + . We used array indexing for reads, so did they. + They /implied/ that we did it differently, when in fact + we use exactly the same technique. They get about + 87MB/sec on reads on a P6, so do we. We challenge + the authors to demonstrate the implied 67% difference + between their numbers and ours. In fact, we challenge + them to demonstrate a 1% difference. + + . We use pointers for writes exactly because we wanted + comparable numbers. The read case is a load and + an integer add per word. If we used array indexing + for the stores, it would be only a store per word. + On older systems, the stores can appear to go faster + because the load/add is slower than a single store. + + While the authors did their best to confuse the issue, the + results speak for themselves. We coded up the write benchmark + our way and their way. Results for a Intel P6: + + pointer array difference + L1 $ 587 710 18% + L2 $ 414 398 4% + memory 53 53 0% + + +Claim 5a: + The harmonic mean stuff. + +Reply: + They just don't understand modern architectures. The harmonic mean + theory is fine if and only if the process can't do two things at + once. 
Many modern processors can indeed do more than one thing at + once, the concept is known as super scalar, and can and does include + load/store units. If the processor supports both outstanding loads + and outstanding stores, the harmonic mean theory fails. + +Claim 6: + + "we modified the memory copy bandwidth to use the same size + data types as the memory read and write benchmark (which use the + machine's native word size); originally, on 32-bit machines, the + copy benchmark used 64-bit types whereas the memory read/write + bandwidth tests used 32- bit types" + +Reply: + + The change was to use 32 bit types for bcopy. On even relatively + modern systems, such as a 586, this change has no impact - the + benchmark is bound by memory sub systems. On older systems, the + use of multiple load/store instructions, as required for the smaller + types, resulted in lower results than the memory system could produce. + + The processor cycles required actually slow down the results. This + is still true today for in cache numbers. For example, an R10K + shows L1 cache bandwidths of 750MB/sec and 377MB/sec with 64 bit + vs 32 bit loads. It was our intention to show the larger number and + that requires the larger types. + + Perhaps because the authors have not ported their benchmark to + non-Intel platforms, they have not noticed this. The Intel + platform does not have native 64 bit types so it does two + load/stores for what C says is a 64 bit type. Just because it + makes no difference on Intel does not mean it makes no difference. diff --git a/performance/lmbench3/results/Makefile b/performance/lmbench3/results/Makefile new file mode 100644 index 0000000..024916a --- /dev/null +++ b/performance/lmbench3/results/Makefile @@ -0,0 +1,320 @@ +# Makefile for lmbench results. +# $Id: Makefile 1.11 00/01/31 16:29:28-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Usage: make [ LIST="aix/* sunos/* ..." ] [ what ] +# +# What to make: +# print Prints the results 1 per page. 
+# ps Saves the postscript of 1 per page in PS/PS +# 4.ps Saves the postscript of 4 per page in PS/PS4 +# 8.ps Saves the postscript of 8 per page in PS/PS8 +# x Previews 1 per page using groff -X +# summary [default] Ascii summary of the results +# stats Do statistics over a set of results +# roff Print the ascii summaries into a roff file +# slides Makes the pic for inclusion in slides +# +# This Makefile requires groff, gpic, and perl. You could try it with +# other *roff processors; I have no idea if it works. +# +# XXX - this is all out of date. +# +# There are three sorts of graphical results: +# +# 1. Bargraphs comparing each system in the LIST on the measurements listed +# in the BG list below (pretty much everything). +# 2. A 2-D graph for each system in LIST, displaying context switch times +# as a function of (# of processes, size of each process). +# 3. A 2-D graph for each system in LIST, displaying memory read times as +# a function of (stride size, memory size). +# +# The bargraphs are in a format of my own - the perl script in scripts +# called bargraph takes them as input and produces pic as output. +# It is a pretty straightforward format, you could probably incorparate +# into some Windows spreadsheet if you wanted to. See tmp/*.bg after +# running make in this directory. +# +# The 2-D graphs are in a format that can (probably) be read by Xgraph. +# I've added a few extensions for titles, etc., that you could just +# take out. See tmp/mem.* after running a make in this directory. +# +# This Makefile is of marginal usefulness to a site with just one machine. +# I intend to make results available so that people can compare, as well +# as a service where you can compare your results against the "best of +# the breed" for each vendor, as well as against best of the lot. + +# List of result files to process. Defaults to everything. 
+LIST= `$(SCRIPTS)getlist $(LST)` + +# Grrrrr +SHELL=/bin/sh + +SCRIPTS=../scripts/ +SRCS= ../scripts/allctx ../scripts/allmem ../scripts/bargraph \ + ../scripts/bghtml ../scripts/getbg ../scripts/getbw \ + ../scripts/getctx ../scripts/getdisk ../scripts/getlist \ + ../scripts/getmax ../scripts/getmem ../scripts/getpercent \ + ../scripts/getresults ../scripts/getsummary ../scripts/gifs \ + ../scripts/graph ../scripts/html-list ../scripts/html-man \ + ../scripts/os ../scripts/percent ../scripts/save \ + ../scripts/stats ../scripts/xroff + +MISC= tmp/misc_mhz.bg \ + tmp/lat_ctx.bg \ + tmp/lat_ctx8.bg \ + tmp/lat_nullsys.bg \ + tmp/lat_signal.bg \ + tmp/lat_pagefault.bg \ + tmp/lat_mappings.bg \ + tmp/lat_fs_create.bg + +PROC= tmp/lat_nullproc.bg \ + tmp/lat_simpleproc.bg \ + tmp/lat_shproc.bg + +LATENCY= \ + tmp/lat_pipe.bg \ + tmp/lat_connect.bg \ + tmp/lat_udp_local.bg \ + tmp/lat_rpc_udp_local.bg \ + tmp/lat_tcp_local.bg \ + tmp/lat_rpc_tcp_local.bg + +BANDWIDTH= \ + tmp/bw_pipe.bg \ + tmp/bw_tcp_local.bg \ + tmp/bw_file.bg \ + tmp/bw_reread.bg \ + tmp/bw_mmap.bg \ + tmp/bw_bcopy_libc.bg \ + tmp/bw_bcopy_unrolled.bg \ + tmp/bw_mem_rdsum.bg \ + tmp/bw_mem_wr.bg + +BG= $(MISC) $(PROC) $(LATENCY) $(BANDWIDTH) + +MK=@$(MAKE) -s +PRINT=groff -p | lpr -h +PS=groff -p | $(SCRIPTS)save PS/PS +PS8UP=groff -p | mpage -P- -8 -a | $(SCRIPTS)save PS/PS8 +PS4UP=groff -p | mpage -P- -4 -a | $(SCRIPTS)save PS/PS4 +SIZE=-big +IMAGE=pbm +CLOSE= +GMEM=$(CLOSE) -grid -logx -xm -below +GCTX=$(CLOSE) -grid -below +GDISK=-below -close -grid -nolines +#IMAGE=gifmono + +summary: $(SRCS) + @$(SCRIPTS)getsummary $(LIST) + +percent: $(SRCS) + @$(SCRIPTS)getpercent $(LIST) + +stats: $(SRCS) + $(SCRIPTS)getsummary $(LIST) | $(SCRIPTS)percent + +roff: + echo .nf > summary.roff + echo .ft CB >> summary.roff + echo .ps 12 >> summary.roff + echo .po .35i >> summary.roff + echo .sp .5i >> summary.roff + make LIST="$(LIST)" summary >> summary.roff + echo .bp >> summary.roff + echo .sp .5i >> 
summary.roff + make LIST="$(LIST)" percent >> summary.roff + +list: + @echo $(LIST) + +print: ctx mem disk bwfile bwmem + +8: + $(MK) LIST="$(LIST)" PRINT="groff -p | mpage -P -8 -a | lpr -h" print + +8.ps 8ps 8up: + $(MK) LIST="$(LIST)" PRINT="$(PS8UP)" print + +4.ps 4ps 4up: + $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" print + +ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" print + +smallps: + $(MK) LIST="$(LIST)" SIZE= PRINT="groff -p | $(SCRIPTS)save PS/smallPS" print + +x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" print + +ctx.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" ctx + +ctx.ps4: + $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" ctx + +mem.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" mem + +disk.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" disk + +bwfile.ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" bwfile + +bwfile.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwfile + +bwmem.ps: + $(MK) LIST="$(LIST)" PRINT="$(PS)" bwmem + +bwmem.x: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwmem + +smallx: + $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" SIZE= print + +slides: + $(MK) LIST="$(LIST)" SIZE=-slide bargraphs.slides ctx.slides mem.slides + +paper: + $(MK) LIST="$(LIST)" tbl.paper ctx.paper mem.paper + +# XXX - this has to be made incremental, doing everything over from +# scratch makes you want a Ghz machine. 
+html: + -make clean + #$(SCRIPTS)bghtml $(BG) + $(SCRIPTS)html-list $(LIST) + $(MK) LIST="$(LIST)" summary > HTML/summary + #make LIST="$(LIST)" percent > HTML/percent + $(MK) LIST="$(LIST)" SIZE= PRINT="$(PS)" \ + GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print + $(MK) LIST="$(LIST)" SIZE= NOOP=-noop PRINT="$(PS)" \ + GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print + gs -sOutputFile=HTML/ctx%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS < /dev/null + gs -sOutputFile=HTML/mem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.1 < /dev/null + gs -sOutputFile=HTML/disk%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.2 < /dev/null + gs -sOutputFile=HTML/bwfile%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.3 < /dev/null + gs -sOutputFile=HTML/bwmem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.4 < /dev/null + gs -sOutputFile=HTML/ctx-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.5 < /dev/null + gs -sOutputFile=HTML/mem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.6 < /dev/null + gs -sOutputFile=HTML/bwfile-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.7 < /dev/null + gs -sOutputFile=HTML/bwmem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.8 < /dev/null + $(SCRIPTS)/gifs + rm HTML/*.pbm HTML/___tmp* + +bghtml: + $(SCRIPTS)bghtml $(BG) + +html-list: + $(SCRIPTS)html-list $(LIST) + +ctx: dirs + $(SCRIPTS)getctx $(LIST) > tmp/FILES + @if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GCTX) $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No context switch data in $(LIST); \ + fi + +disk: dirs + if [ X$(NOOP) = X ]; then \ + $(SCRIPTS)getdisk $(LIST) > tmp/FILES; \ + if [ -s tmp/FILES ]; \ + then for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GDISK) $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No disk data in $(LIST); \ + fi; \ + fi + +mem: dirs 
+ $(SCRIPTS)getmem $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GMEM) -nomarks $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No memory latency data in $(LIST); \ + fi + +bwfile: dirs + $(SCRIPTS)getbw $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph $(SIZE) $(GMEM) -logy $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No file bandwidth data in $(LIST); \ + fi + +bwmem: dirs + $(SCRIPTS)getbw -all $(LIST) > tmp/FILES + if [ -s tmp/FILES ]; \ + then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ + for i in `cat tmp/FILES`; \ + do $(SCRIPTS)graph -halfgrid -gthk_5 -thk2 -medium \ + -nomarks -nolabels -grapheach $(GMEM) \ + -logy %P="'`basename $$i`'" $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT); \ + else echo No memory bandwidth data in $(LIST); \ + fi + +tbl.paper: + $(SCRIPTS)getbg -paper $(LIST) + + +bargraphs.1st: dirs + $(SCRIPTS)getbg -nosort $(LIST) + #$(SCRIPTS)getmax -v $(PROC) + #$(SCRIPTS)getmax -v $(LATENCY) + #$(SCRIPTS)getmax -v -half $(BANDWIDTH) + +bargraphs: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i; \ + echo .bp; \ + done | sed '$$d' | $(PRINT) + +bargraphs.slides: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i > $${i}.pic; \ + done + +bargraphs.8up: bargraphs.1st + for i in $(BG); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +latency.8up: bargraphs.1st + for i in $(LATENCY); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +bw.8up: bargraphs.1st + for i in $(BANDWIDTH); \ + do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ + echo .bp; \ + done | sed '$$d' | $(PS8UP) + +get: # nothing 
to do + +clean: + /bin/rm -f PS/* GIF/* HTML/* tmp/* summary.roff + +dirs: + @if [ ! -d tmp ]; then mkdir tmp; fi + @if [ ! -d PS ]; then mkdir PS; fi + @if [ ! -d HTML ]; then mkdir HTML; fi diff --git a/performance/lmbench3/runtest.sh b/performance/lmbench3/runtest.sh new file mode 100755 index 0000000..3a81c6d --- /dev/null +++ b/performance/lmbench3/runtest.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +if [ -f bin/$(uname -m)-linux-gnu/$(scripts/config) ]; then + make rerun +else + make + make results +fi + +cd results +make summary + +exit 0 diff --git a/performance/lmbench3/scripts/Makefile b/performance/lmbench3/scripts/Makefile new file mode 100644 index 0000000..7abca50 --- /dev/null +++ b/performance/lmbench3/scripts/Makefile @@ -0,0 +1,8 @@ +# Makefile for lmbench scripts subdir. +#$Id: Makefile 1.3 00/01/31 16:29:28-08:00 lm@xxxxxxxxxxxxxxx $ + +get: + get -s + +clean: + diff --git a/performance/lmbench3/scripts/README b/performance/lmbench3/scripts/README new file mode 100644 index 0000000..6e84ad1 --- /dev/null +++ b/performance/lmbench3/scripts/README @@ -0,0 +1,7 @@ +$Id: README 1.2 97/06/14 21:10:42-07:00 lm@xxxxxxxxxxxxxxx $ + +This directory contains scripts used to generate or post process lmbench +output. You probably do not want to be here or run these by hand, the +Makefiles in ../src and ../results invoke these. There are some useful +scripts here, however, in particular the graphing scripts. If you are +interested in groff graphing tools, check out ../doc/*graph.1. diff --git a/performance/lmbench3/scripts/SHIT b/performance/lmbench3/scripts/SHIT new file mode 100644 index 0000000..de2a060 --- /dev/null +++ b/performance/lmbench3/scripts/SHIT @@ -0,0 +1,724 @@ + +# Go find perl if we are running this as a shell script. +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Mimic the BSD tool, sccs, for RCS. +# $Id: SHIT 1.2 95/11/29 12:39:38-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Note - this reflects a lot of my personal taste. 
I'll try and list the +# important differences here: +# +# A bunch of unused commands are not implemented. It is easy to add them, +# mail me if you want me to add something. Please include a spec of what +# you want the command to do. Mail lm@xxxxxxxxxxxx. +# +# I look at RCS file internals and know about certain fields as of revision +# 5.x. +# +# This interface does not require a list of files/directories for most +# commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +# such as clean -f, unedit, unget, do *not* have an implied list. In +# other words, +# rccs diffs is the same as rccs diffs RCS +# but +# rccs unedit is not the same as rccs unedit RCS +# +# If you add (potentially) destructive commands, please check for +# them in main() and make sure that the autoexpand does not happen. +# +# TODO: +# Make it so that you can pass a list of files/dirs via stdin. +# +# It might be nice to have all the "system" args printed out in +# verbose and/or learn mode. Depends on whether you want people +# to learn RCS or not. + +&init; +&main; + +sub init +{ + $0 =~ s|.*/||; + # Add commands here so that -w shuts up. + $lint = 0; + + &clean() && &create() && &example() && &get() && &edit() && + &unedit() && &unget() && &diffs() && &delta() && &help() && + &prs() && &prt() && &deledit() && &delget() && &enter() && + &info() && &ci() && &co() && &fix() && &print() + if $lint; +} + +sub help +{ + if ($#_ == -1) { + &usage; + } + + # Handle all the aliases. + if ($_[0] eq "unedit" || $_[0] eq "unget") { + &help("clean"); + } elsif ($_[0] eq "clean") { + } + warn "Extended help on @_ not available yet.\n"; +} + +sub usage +{ +print <<EOF; + +usage: $0 [$0 opts] command [args] [file and/or directory list] + +$0 options are: + -debug for debugging of $0 itself + -verbose for more information about what $0 is doing + +More information may be had by saying "$0 help subcommand". + +Most commands take "-s" to mean do the work silently. 
+ +Command Effect +------- ------ + clean - remove unedited (ro) working files + -e remove unmodified edited (rw) & unedited (ro) files + -f (force) remove modified working files as well + create - add a set of files to RCS control and get (co) the working files + -g do not do the get (co) of the working files + -y<msg> use <msg> as the description message (aka -d<msg>) + delta - check in a revision + -y<msg> use <msg> as the log message (aka -d<msg>) + -s + diffs - diff the working file against the RCS file + fix - redit the last revision + get - get the working file[s] (possibly for editing) + history - print history of the files + print - print the history and the latest contents + +Alias Real command Effect +----- ------------ ------ + ci - delta check in a revision + co - get check out a revision + enter - create -g initialize a file without a get afterward + unedit - clean -f remove working file even if modified + unget - clean -f remove working file even if modified + edit - get -e check out the file for editing + prs - history print change log history + prt - history print change log history + +An implied list of *,v and/or RCS/*,v is implied for most commands. +The exceptions are commands that are potentially destructive, such as +unedit. + +EOF + + exit 0; +} + +sub main +{ + local($cmd); + local(@args); + local(@comma_v); + + $cmd = "oops"; + $cmd = shift(@ARGV) if $#ARGV > -1; + &help(@ARGV) if $cmd eq "help" || $cmd eq "oops"; + + $dir_specified = $file_specified = 0; + foreach $_ (@ARGV) { + # If it is an option, just pass it through. + if (/^-/) { + push(@args, $_); + } + # If they specified an RCS directory, explode it into ,v files. + elsif (-d $_) { + $dir_specified = 1; + warn "Exploding $_\n" if $debug; + push(@args, grep(/,v$/, &filelist($_))); + push(@args, grep(/,v$/, &filelist("$_/RCS"))); + } + # If it is a file, make it be the ,v file. + else { + if (!/,v$/) { + # XXX - what if both ./xxx,v and ./RCS/xxx,v? 
+ if (-f "$_,v") { + $_ .= ",v"; + } else { + if (m|/|) { + m|(.*)/(.*)|; + $f = "$1/RCS/$2,v"; + } else { + $f = "RCS/$_,v"; + } + if (-f $f) { + $_ = $f; + } + } + } + if (-f $_) { + $file_specified = 1; + warn "Adding $_\n" if $debug; + push(@args, $_); + } else { + warn "$0: skipping $_, no RCS file.\n"; + } + } + } + + # Figure out if it is a potentially destructive command. These + # commands do not automagically expand *,v and RCS/*,v. + $destructive = ($cmd eq "clean" && $args[0] eq "-f") || + $cmd eq "unedit" || $cmd eq "unget"; + + # If they didn't specify a file or a directory, generate a list + # of all ./*,v and ./RCS/*,v files. + unless ($destructive || $dir_specified || $file_specified) { + warn "Exploding . && ./RCS\n" if $debug; + push(@args, grep(/,v$/, &filelist("."))); + push(@args, grep(/,v$/, &filelist("RCS"))); + } + + unless ($cmd =~ /^create$/) { + @comma_v = grep(/,v$/, @args); + if ($#comma_v == -1) { + ($s = "$cmd @ARGV") =~ s/\s+$//; + die "$0 $s: No RCS files specified.\n"; + } + } + + # Exit codes: + # 0 - it worked + # 1 - unspecified error + # 2 - command unknown + $exit = 2; + warn "Trying &$cmd(@args)\n" if $debug; + eval(&$cmd(@args)); + + if ($exit == 2) { + warn "Possible unknown/unimplemented command: $cmd\n"; + &usage; + } else { + exit $exit; + } +} + +# Read the directory and return a list of files. +# XXX - isn't there a builtin that does this? +sub filelist +{ + local(@entries) = (); + local($ent); + + opendir(DFD, $_[0]) || return (); + foreach $ent (readdir(DFD)) { + $ent = "$_[0]/$ent"; + next unless -f $ent; + push(@entries, $ent); + } + warn "filelist($_[0]): @entries\n" if $debug; + @entries; +} + +# Take a list of ,v files and return a list of associated working files. +sub working +{ + local(@working, $working) = (); + + foreach $comma_v (@_) { + # Strip the ,v. + # Strip the RCS specification. 
+ ($working = $comma_v) =~ s|,v$||; + $working =~ s|RCS/||; + push(@working, $working); + } + @working; +} + +# Same as "clean -f" - throw away all changes +sub unedit { &clean("-f", @_); } +sub unget { &clean("-f", @_); } + +# Get rid of everything that isn't edited and has an associated RCS file. +# -e remove edited files that have not been changed. +# -f remove files that are edited with changes (CAREFUL!) +# This implies the -e opt. +# -d<m> Check in files that have been modified. If no message, prompt +# on each file. This implies -e. +# -y<m> Like -d for people that are used to SCCS. +# -m<m> Like -d for people that are used to RCS. +# +# Note: this does not use rcsclean; I don't know when that showed up. And +# the 5.x release of RCS I have does not install it. +sub clean +{ + local(@working); + local($e_opt, $f_opt, $d_opt, $s_opt) = (0,0,0,0); + local($msg); + local(@checkins) = (); + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-s") { + $s_opt = 1; + shift(@_); + } elsif ($_[0] eq "-e") { + $e_opt = 1; + shift(@_); + } elsif ($_[0] eq "-f") { + $f_opt = $e_opt = 1; + shift(@_); + } elsif ($_[0] =~ /^-[dym]/) { + $d_opt = $e_opt = 1; + if ($_[0] =~ /^-[dym]$/) { + $msg = $_[0]; + } else { + ($msg = $_[0]) =~ s/-[ydm]//; + $msg = "-m'" . $msg . "'"; + } + shift(@_); + } else { + die "$0 clean: unknown option: $_[0]\n"; + } + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Unlink. + if (!-w $working[$i]) { + warn "rm $working[$i]\n" unless $s_opt; + # Make sure there is an RCS file + if (-f $_[$i]) { + # XXX - what if ro and edited? + unlink($working[$i]) unless $n; + } else { + warn "clean: no RCS file for $working[$i]\n"; + } + next; + } + + # If they just want to know about it, tell them. 
+ if ($e_opt == 0) { + open(RCS, $_[$i]); + while ($r = <RCS>) { + last if $r =~ /locks/; + } + @locks = (); + while ($r = <RCS>) { + # XXX - I use "comment" a delimiter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } else { + warn "$working[$i]: " . + "writeable but not edited?!?\n"; + } + next; + } + + # See if there have actually been any changes. + # Notice that this is cmp(1) in about 10 lines of perl! + open(RCS, "co -q -p -kkvl $_[$i] |"); + open(WORK, $working[$i]); + $diff = 0; + while ($r = <RCS>) { + unless (($w = <WORK>) && ($r eq $w)) { + $diff = 1; + last; + } + } + if ($w = <WORK>) { + $diff = 1; + } + close(RCS); close(WORK); + if ($diff) { + if ($f_opt) { + warn "Clean modified $working[$i]\n" + unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } elsif ($d_opt) { + push(@checkins, $_[$i]); + } else { + warn "Can't clean modified $working[$i]\n"; + } + next; + } else { + warn "rm $working[$i]\n" unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } + } + + # Handle files that needed deltas. + if ($#checkins > -1) { + warn "ci -q $msg @checkins\n" if $verbose; + system "ci -q $msg @checkins"; + } + + $exit = 0; +} + +# Create - initialize the RCS file +# -y<c> - use <c> as the description message for all files. +# -d<c> - use <c> as the description message for all files. +# -g - don't do the get +# +# Differs from sccs in that it does not preserve the original +# files (I never found that very useful). +sub create +{ + local($arg, $noget, $description, $cmd) = ("", "", ""); + + foreach $arg (@_) { + # Options... 
+ if ($arg =~ /^-[yd]/) { + ($description = $arg) =~ s/^-[yd]//; + $arg = ""; + warn "Desc: $description\n" if $debug; + next; + } + if ($arg eq "-g") { + $noget = "yes"; + $arg = ""; + next; + } + next if ($arg =~ /^-/); + + # If no RCS subdir, make one. + if ($arg =~ m|/|) { # full path + ($dir = $arg) =~ s|/[^/]+$||; + mkdir("$dir/RCS", 0775); + } else { # in $CWD + mkdir("RCS", 0775); + } + } + $exit = 0; + if ($description ne "") { + $cmd = "ci -t-'$description' @_"; + } else { + $cmd = "ci @_"; + } + warn "$cmd\n" if $verbose; + system "$cmd"; + system "co @_" unless $noget; +} + +# Like create without the get. +sub enter { &create("-g", @_); } + +# Edit - get the working file editable +sub edit { &get("-e", @_); } + +# co - normal RCS +sub co { &get(@_); } + +# Get - get the working file +# -e Retrieve a version for editing. +# Same as co -l. +# -p Print the file to stdout. +# -k Suppress expansion of ID keywords. +# Like co -kk. +# -s Suppress all output. +# +# Note that all other options are passed to co(1). +sub get +{ + local($arg, $working, $f, $p); + + $f = $p = 0; + foreach $arg (@_) { + # Options... + $arg = "-l" if ($arg eq "-e"); + $arg = "-kk" if ($arg eq "-k"); + $arg = "-q" if ($arg eq "-s"); + $f = 1 if ($arg eq "-f"); + $p = 1 if ($arg eq "-p"); # XXX - what if -sp? + + next if $arg =~ /^-/ || $p; + + # Check for writable files and skip them unless someone asked + # for co's -f option. + ($working = $arg) =~ s|,v$||; + $working =~ s|RCS/||; + if ((-w $working) && $f == 0) { + warn "ERROR [$arg]: writable `$working' exists.\n"; + $arg = ""; + } + } + @files = grep(/,v/, @_); + if ($#files == -1) { + warn "$0 $cmd: no files to get. @_\n"; + $exit = 1; + } else { + system "co @_"; + $exit = 0; + } +} + +# Aliases for history. 
+sub prt { &history(@_); } +sub prs { &history(@_); } + +# History - change history sub command +sub history +{ + local(@history); + + open(RL, "rlog @_|"); + # Read the whole history + while ($r = <RL>) { + # Read the history for one file. + if ($r !~ /^[=]+$/) { + push(@history, $r); + next; + } + &print_history(@history); + @history = (); + } + close(RL); + print "+-----------------------------------\n"; + $exit = 0; +} + +sub print_history +{ + for ($i = 0; $i <= $#_; ++$i) { + # Get the one time stuff + if ($_[$i] =~ /^RCS file:/) { + $_[$i] =~ s/RCS file:\s*//; + chop($_[$i]); + print "+------ $_[$i] -------\n|\n"; + } + + # Get the history + if ($_[$i] =~ /^----------------------------/) { + local($rev, $date, $author, $lines) = ("", "", "", ""); + + $i++; + die "Bad format\n" unless $_[$i] =~ /revision/; + $_[$i] =~ s/revision\s+//; + chop($_[$i]); + $rev = $_[$i]; + $i++; + die "Bad format\n" unless $_[$i] =~ /date/; + @parts = split(/[\s\n;]+/, $_[$i]); + for ($j = 0; $j <= $#parts; $j++) { + if ($parts[$j] =~ /date/) { + $j++; + $date = "$parts[$j] "; + $j++; + $date .= "$parts[$j]"; + } + if ($parts[$j] =~ /author/) { + $j++; + $author = $parts[$j]; + } + if ($parts[$j] =~ /lines/) { + $j++; + $lines = "$parts[$j] "; + $j++; + $lines .= "$parts[$j]"; + } + } + print "| $rev $date $author $lines\n"; + while ($_[++$i] && + $_[$i] !~ /^----------------------------/) { + print "| $_[$i]"; ### unless $rev =~ /^1\.1$/; + } + print "|\n"; + $i--; + } + } +} + +# Show changes between working file and RCS file +# +# -C -> -c for compat with sccs (not sure if this is needed...). 
+sub diffs +{ + local(@working); + local($diff) = "diff"; + local($rev) = ""; + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-C") { + $diff .= " -c"; + shift(@_); + } elsif ($_[0] =~ /^-r/) { + $rev = $_[0]; + shift(@_); + } elsif ($_[0] eq "-sdiff") { + # XXX - screen size + $diff = "sdiff -w80"; + shift(@_); + } else { + $diff .= " $_[0]"; + shift(@_); + } + + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Skip. + next unless (-w $working[$i]); + + # Show the changes + print "\n------ $working[$i]$rev ------\n"; + fflush(stdout); + # XXX - flush stdout. + if ($diff =~ /^sdiff/) { + system "co -q -p -kkvl $rev $_[$i] > /tmp/sdiff.$$" . + "&& $diff /tmp/sdiff.$$ $working[$i]"; + # XXX - interrupts? + unlink("/tmp/sdiff.$$"); + } else { + system "co -q -p -kkvl $rev $_[$i] |" . + " $diff - $working[$i]"; + } + } + + $exit = 0; +} + +# delta - check in the files +sub delta +{ + local($description) = (""); + local($i, @working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # Options... + if ($_[$i] =~ /^-[yd]/) { + ($description = $_[$i]) =~ s/^-[yd]/-m/; + $description = "'" . $description . "'"; + $_[$i] = ""; + next; + } + $_[$i] = "-q" if $_[$i] eq "-s"; + $_[$i] = "" unless -f $working[$i]; + } + $exit = 0; + warn "ci $description @_\n" if $verbose; + system "ci $description @_"; +} + +# Allow RCS interface ci +sub ci +{ + &delta(@_); +} + +# delget +sub delget +{ + &delta(@_); + &get(@_); # If there was a description, delta nuked it... +} + +# deledit +sub deledit +{ + &delta(@_); + &get("-e", @_); # If there was a description, delta nuked it... 
+} + + +# info - who is editing what +sub info +{ + local(@working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; $i++) { + open(RCS, $_[$i]); + while ($r = <RCS>) { + last if $r =~ /locks/; + } + @locks = (); + while ($r = <RCS>) { + # XXX - I use "comment" a delimter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } + } + $exit = 0; +} + +# Fix - fix the last change to a file +sub fix +{ + foreach $f (@_) { + next unless -f $f; + open(F, $f); while (<F>) { last if /head\s\d/; } close(F); + unless ($_ && /head/) { + warn "$0 $cmd: No head node found in $f\n"; + next; + } + s/head\s+//; chop; chop; $rev = $_; + ($working = $f) =~ s/,v//; + $working =~ s|RCS/||; + system "co -q $f && rcs -o$rev $f && rcs -l $f && chmod +w $working"; + } + $exit = 0; +} + +# print - print the history and the latest revision of the file +sub print +{ + local($file); + + foreach $file (@_) { + &history($file); + &get("-s", "-p", $file); + } + $exit = 0; +} + + +# Example - example sub command +# -Q change this option to -q just to show how. +sub example +{ + local($arg, $working); + + foreach $arg (@_) { + # Options... + $arg = "-Q" if ($arg eq "-q"); + } + warn "rlog @_\n" if $verbose; + system "rlog @_"; + $exit = 0; +} + +RCS bghtml html-list man2html diff --git a/performance/lmbench3/scripts/TODO b/performance/lmbench3/scripts/TODO new file mode 100755 index 0000000..c9430db --- /dev/null +++ b/performance/lmbench3/scripts/TODO @@ -0,0 +1,3 @@ +Make graph take a %T and %T2 and put %T above %T2 + +Or make it take \n in the title and deal. diff --git a/performance/lmbench3/scripts/allctx b/performance/lmbench3/scripts/allctx new file mode 100755 index 0000000..386c5e5 --- /dev/null +++ b/performance/lmbench3/scripts/allctx @@ -0,0 +1,71 @@ + +# Extract the context switching information from lmbench result files. +# Usage: getctx file file.... 
+# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: allctx 1.3 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$first = 1; +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|.*/||; + $file =~ s/\.\d+//; + while (<FD>) { + chop; + if (/^\[lmbench/) { + split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^.size=/) { + s/size/Process size/; + s/ ovr/\toverhead/; + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "\n" unless $first; + $first = 0; + print "%T $info[3] $info[$#info]Mhz\n"; + print "$_\n"; + while (<FD>) { + last if /^Null/ || /^Pipe/ || /^Memor/; + next if /\$Id/; + s/ ovr/\toverhead/; + s/size/Process size/; + print ; + } + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = sprintf("%.0f", $_[1]); + + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/allmem b/performance/lmbench3/scripts/allmem new file mode 100755 index 0000000..9243873 --- /dev/null +++ b/performance/lmbench3/scripts/allmem @@ -0,0 +1,69 @@ + +# Extract the memory latency graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: allmem 1.3 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +# Uses a stride of 128 +#print "\"%X Array size\n\"%Y Latency in nanoseconds\n"; +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|.*/||; + while (<FD>) { + chop; + if (/^\[lmbench/) { + split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^Memory load latency/) { + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "\"$file $info[3] $info[$#info]\n"; + while (<FD>) { + next unless /^"stride=128/; + last; + } + while (<FD>) { + if (/^\s*$/) { + print "\n"; + last; + } + print; + } + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = sprintf("%.0f", $_[1]); + + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/bargraph b/performance/lmbench3/scripts/bargraph new file mode 100755 index 0000000..f710133 --- /dev/null +++ b/performance/lmbench3/scripts/bargraph @@ -0,0 +1,430 @@ +# $Id: bargraph 1.5 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +# A simple bargraph preprocessor for GNU pic / troff package. +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# TODO +# Make this work with sideways graphs. 
+# +# Input format is: +# +# 3 foo bar +# 9 bigger foo +# "Silly example +# +# and output is +# +# bigger +# foo +# +----------+ +# | | +# foo | | +# bar | | +# +----------+ | | +# | | | | +# +----------+ +----------+ +# ------------------------------- +# 3 9 +# +# Silly example +# +# Input options: +# specifier value default +# %ps <point size> 10 +# %ft <font> HB +# %labelgap <space in inches between fill labels> 1.5 +# %xsize <size of graph width in inches> 7 +# %ysize <size of graph height in inches> 6 +# %Title n|s <Bargraph title> none +# %titleplus <increase in points of titlesize> 0 +# %label%d <label name> none +# %boxpercent <100% means columns touch> 75 +# %worse up|down n|w|e|s|nw|ne|sw|se - idiot arrow +# %better up|down n|w|e|s|nw|ne|sw|se - idiot arrow +# %fakemax <pretend one data point was this big> +# +# The data can be optionally followed by a %fill%d that gets turned into +# the fill value (darkness) for that bar of the bar graph. The default +# fill value is whatever pic defaults to. +# The %label control is used to provide a legend for the different fill +# values. +# +# Command line options: +# +# -big make the x/y defaults be 7.5 inches, crank up title size, and +# don't put a spacer at the top. +# -nobox do not put an outline box around the bargraph. +# +# -sideways +# do the bars towards the right. +# +# Much thanks to James Clark for providing such a nice replacement for +# the Unix troff package. + +@lines = <>; # sluuuuuuuuuuuurp +$titleplus = 2; +$bottomplus = 0; +$fill = "fillval"; +$SP = ".sp 1i"; +$PO = "0i"; +# All of these can be set in the graph with %xxx value +$ps = 10; +$ft = "CB"; +$xsize = 4; +$ysize = 6; +$boxpercent = 75; +$labelgap = 1.5; +if ($nobox) { + $invis = "invis"; +} else { + $invis = ""; +} +if ($big) { + $slide = 0; + $xsize = 7.5; + $ysize = 7.5; + $SP = ""; + $titleplus = 4; + $bottomplus = 2; + # XXX - you may need to screw with this. 
+ $xsize -= 3.75 if ($sideways); +} +if ($slide) { + $big = 0; + $xsize = 6.5; + $ysize = 4.20; + $SP = ".sp .75i"; + $PO = ".23i"; + $titleplus = 2; + $bottomplus = 0; + # XXX - you may need to screw with this. + $xsize -= 2.2 if ($sideways); +} + +$vs = $ps + 1; + +# Calculate max to autosize the graph. +foreach $_ (@lines) { + next if /^\s*#/; + next if /^\s*$/; + + if (/^\s*"/) { + ($title = $_) =~ s/\s*"//; + chop($title); + push(@title, "\"\\s+$titleplus$title\\s0\""); + next; + } + if (/^\s*%/) { + &control(0); + push(@control, $_); + next; + } + + @_ = split; + if (!defined $maxdata) { + $maxdata = $_[0]; + } else { + $maxdata = $_[0] if ($maxdata < $_[0]); + } + push(@data, $_); +} + +foreach $_ (@control) { + &control(1); +} + +$n = $#data + 1; +$tps = $ps + $titleplus; +$tvs = int($tps * 1.2); +print <<EOF; +$SP +.po $PO +.ft $ft +.ps $ps +.vs $tvs +.ce 100 +EOF +foreach $_ (@title_n) { + print; +} +# Spit out the pic stuff. +# The idea here is to spit the variables and let pic do most of the math. +# This allows tweeking of the output by hand. +print <<EOF; +.ce 0 +.vs +.PS +.ps $ps +.vs $vs +[ +# Variables, tweek these. + fillval = .12 # default fill value boxes + xsize = $xsize # width of the graph + ysize = $ysize # height of the graph + n = $n + boxpercent = $boxpercent / 100 + gap = xsize / n * (1 - boxpercent) + maxdata = $maxdata + yscale = ysize / maxdata + xscale = xsize / maxdata + +# Draw the graph borders + O: box invis ht ysize wid xsize +EOF +# line thick 2 from O.sw - (0, .1) to O.se - (0, .1) + +#foreach $_ (@control) { +# &control(1); +#} + +# boxwid = xsize / n * boxpercent +if ($sideways) { + print "boxht = ysize / n * boxpercent\n"; + # Each data point. + for ($i = 0; $i <= $#data; $i++) { + $_ = $data[$i]; + @_ = &getfill; + print "box fill $fill wid $_[0] * xscale " . 
+ "with .nw at O.nw - (0, gap /2 + $i * (ysize/n))\n"; + $value = shift(@_); + # XXXXXXX + if ($_[$#_] =~ /secs/) { + #print "\"@_\" ljust at last box.e + .1,0\n"; + $units = pop(@_); + $each = pop(@_); + print "\"\\s+1$value\\s0, @_,\\ \\s+1$each $units\\s0\" ljust at last box.e + .1,0\n"; + } else { + print "\"\\s+2$value\\s0 @_\" ljust at last box.e + .1,0\n"; + } + } +} else { + print "boxwid = xsize / n * boxpercent\n"; + # Each data point. + for ($i = 0; $i <= $#data; $i++) { + $_ = $data[$i]; + @_ = &getfill; + print "box fill $fill ht $_[0] * yscale " . + "with .sw at O.sw + (gap /2 + $i * (xsize/n), 0)\n"; + $value = shift(@_); + @_ = &fmt(@_); + #warn "V=$value\nT=@_\n"; + # Make the bar titles + for ($j = $#_; $j >= 0; $j--) { + print "\t\"$_[$j]\" at last box.n + (0, .05 + .12 * $j)\n"; + } + print "\t\"\\s+$bottomplus$value\\s0\" at last box.s - (0, .30)\n"; + } + +} + +# Labels, if any +if ($#labels > -1) { + print "\n# Labels.\n"; + print "[\n boxwid = .35; boxht = .18; y = .10; x = -.03; "; + print "labelgap = $labelgap\n"; + $first = 1; + foreach $_ (@labels) { + print " [ B: box fill $_[0]; "; + shift(@_); + print "\"@_\" ljust at B.e + (y, x) ]"; + if ($first == 1) { + $first = 0; + print "\n"; + } else { + print " \\\n\twith .w at last [].e + (labelgap, 0)\n"; + } + } + print "] with .nw at O.sw - (0, .6)\n"; +} + +$invis = "invis" if $sideways; + +print <<EOF; +] +box $invis wid last [].wid + .5 ht last [].ht + .5 with .nw at last [].nw + (-.25, .25) +move to last [].nw + 0,.25 +line thick 2 right 7 +move to last [].sw - 0,.25 +line thick 2 right 7 +.PE +.ft +.ps +.vs +.po +EOF + +print <<EOF; +.po .5i +.ft $ft +.ps $ps +.vs $tvs +.sp .5 +.ce 100 +EOF +foreach $_ (@title_s) { + print; +} +print <<EOF; +.po +.ft +.ps +.vs +.ce 0 +EOF +exit 0; + +sub fmt +{ + local(@args); + local(@ret); + + # XXX - this assumes that # is not used anywhere else in the + # label line. 
+ $_ = "@_"; + s/\\ /#/g; + @args = split; + foreach $_ (@args) { + s/#/ /g; + } + $len = 0; + foreach $_ (@args) { + $len = length($_) if (length($_) > $len); + } + $len += 2; + $word = shift(@args); + while ($#args > -1) { + if (length($word) + length($args[0]) < $len) { + $word .= " $args[0]"; + shift(@args); + } else { + push(@ret, $word); + $word = shift(@args); + } + } + push(@ret, $word); + reverse(@ret); +} + +# Eat some control information +# +sub control +{ + local($pass) = $_[0]; + + if ($pass == 0) { + s/.*%//; + chop; + } + @_ = split; + if ($_[0] =~ /[Ww]orse$/ || $_[0] =~ /[Bb]etter$/) { + return if ($pass == 0); + if ($#_ != 2) { + die "bad control: $_\n"; + return; + } + ($label, $dir, $where) = @_; + print "\n# Idiot arrow\n"; + print "[\tarrow thick 10 wid .5 ht .4 $dir 1.15\n"; + print "\t\"\\s+9$label\\s0\" "; + if ($dir eq "up") { + print "at last arrow.s - (0, .25)\n"; + } elsif ($dir eq "down") { + print "at last arrow.n + (0, .25)\n"; + } else { + die "bad control: $_\n"; + } + print "] with .$where at O.$where "; + if ($where eq "n") { + print "- (0, .5)\n"; + } elsif ($where eq "ne") { + print "- (.5, .5)\n"; + } elsif ($where eq "e") { + print "- (.5, 0)\n"; + } elsif ($where eq "se") { + print "- (.5, -.5)\n"; + } elsif ($where eq "s") { + print "+ (0, .5)\n"; + } elsif ($where eq "sw") { + print "+ (.5, .5)\n"; + } elsif ($where eq "w") { + print "+ (.5, 0)\n"; + } elsif ($where eq "nw") { + print "+ (.5, -.5)\n"; + } else { + die "bad control: $_\n"; + } + print "\n"; + } elsif ($_[0] =~ /Title/) { + # XXX - I haven't fixed this for -sideways + return if ($pass == 0); + if ($_[1] eq "n") { + shift(@_); shift(@_); + push(@title_n, "\\s+$titleplus@_\\s0\n"); + } elsif ($_[1] eq "s") { + shift(@_); shift(@_); + push(@title_s, "\\s+$titleplus@_\\s0\n"); + } else { + die "bad control: $_\n"; + } + } elsif ($_[0] =~ /ps/) { + $ps = $_[1]; + } elsif ($_[0] =~ /ft/) { + $ft = $_[1]; + } elsif ($_[0] =~ /xsize/) { + $xsize = $_[1]; + } elsif 
($_[0] =~ /ysize/) { + $ysize = $_[1]; + } elsif ($_[0] =~ /titleplus/) { + $titleplus = $_[1]; + } elsif ($_[0] =~ /boxpercent/) { + $boxpercent = $_[1]; + } elsif ($_[0] =~ /labelgap/) { + $labelgap = $_[1]; + } elsif ($_[0] =~ /label/) { # has to be after labelgap + return if ($pass == 0); + $_[0] =~ s/label//; + if (length($_[0]) > 0) { + $fill = $_[0]; + } else { + $fill = "fillval"; + } + push(@labels, "@_"); + } elsif ($_[0] =~ /fakemax/) { + if (!defined $maxdata) { + $maxdata = $_[1]; + } else { + $maxdata = $_[1] if ($maxdata < $_[1]); + } + } else { + die "bad control: $_\n"; + } +} + +# Look for a %fill[val], eat it, and set $fill +sub getfill +{ + local (@line); + + if (/%fill/) { + @_ = split; + foreach $_ (@_) { + if (/%fill/) { + s/%fill//; + if (length($_) > 0) { + $fill = $_; + } else { + $fill = "fillval"; + } + } else { + push(@line, $_); + } + } + } else { + $fill = "fillval"; + @line = split; + } + @line; +} diff --git a/performance/lmbench3/scripts/bghtml b/performance/lmbench3/scripts/bghtml new file mode 100755 index 0000000..5e01f0a --- /dev/null +++ b/performance/lmbench3/scripts/bghtml @@ -0,0 +1,39 @@ + +# Make HTML files that will point to the right GIF files. +# Usage: bghtml file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. 
+# $Id: bghtml 1.2 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$bar = 0; +for ($i = 0; $i <= $#ARGV; ++$i) { + $file = $ARGV[$i]; $file =~ s|tmp/||; $file =~ s|.bg$||; + if ($i > 0) { + $prev = $ARGV[$i - 1]; + $prev =~ s|tmp/||; + $prev =~ s|.bg$||; + $prev_html = "${prev}.html"; + } + if ($i < $#ARGV) { + $next = $ARGV[$i + 1]; + $next =~ s|tmp/||; + $next =~ s|.bg$||; + $next_html = "${next}.html"; + } + $name = "HTML/${file}.html"; + open(F, ">$name"); + print F "<a href=${file}.8>Man page for this benchmark</a><p>\n"; + $str = sprintf("<IMG SRC=\"bar%02d\">\n", ++$bar); + print F "$str<p>"; + print F "<a href=lmbench-toc.html><img src=\"gifs/arrows/b_arrow.gif\"</a>\n"; + print F "<a href=lmbench-S-6.html><img src=\"gifs/graph.gif\"</a>\n"; + print F "<a href=${prev_html}><img src=\"gifs/arrows/back.gif\"</a>\n" + if $i > 0; + print F "<a href=${next_html}><img src=\"gifs/arrows/forward.gif\"</a>\n" + if $i < $#ARGV; + close(F); +} +exit 0; diff --git a/performance/lmbench3/scripts/build b/performance/lmbench3/scripts/build new file mode 100755 index 0000000..16a6600 --- /dev/null +++ b/performance/lmbench3/scripts/build @@ -0,0 +1,252 @@ +#!/bin/sh + +CC=${CC-`../scripts/compiler`} +MAKE=${MAKE-`../scripts/make`} +OS=${OS-`../scripts/os`} +TARGET=${TARGET-`../scripts/target`} +BINDIR=../bin/"${OS}" +CONFIG=../bin/"${OS}"/`../scripts/config` +NULL=/dev/null + +BASE=/tmp/dummy +for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then BASE=${t}/dummy + break + fi +done + +trap 'rm -f ${BASE}$$.s ${BASE}$$.c ${BASE}$$.o ${BASE}$$; exit 1' 1 2 15 + +LDLIBS=-lm + +# check for HP-UX's ANSI compiler +echo "main(int ac, char *av[]) { int i; }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} +then + true; +else + rm -f ${BASE}$$ + if ${CC} ${CFLAGS} -Ae -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} + then + CFLAGS="${CFLAGS} -Ae" + fi +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# 
check for IA64 HP-UX w/ HP's ANSI compiler; may need pointer swizzling +arch=`echo $OS | awk -F- '{print $1;}'` +if [ X$CC = "Xcc" -a X$arch = "Xia64" ] +then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main(int ac, char *av[])" >> ${BASE}$$.c + echo "{ long* p = (long*)malloc(sizeof(long));" >> ${BASE}$$.c + echo "*p = 0; exit((int)*p); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} +DD64 -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && [ -x ${BASE}$$ ] \ + && ${BASE}$$ \ + && CFLAGS="${CFLAGS} +DD64" + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check for bcopy (optionally set the SYS5 flag) +echo "#include <string.h>" > ${BASE}$$.c +echo "main() { char a[256], b[256]; bcopy(a, b, 256); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -DSYS5" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for valloc +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "main() { char* buf = valloc(123); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -Dvalloc=malloc" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for getrusage +echo "#include <sys/types.h>" > ${BASE}$$.c +echo "#include <sys/time.h>" >> ${BASE}$$.c +echo "#include <sys/resource.h>" >> ${BASE}$$.c +echo "#ifndef RUSAGE_SELF" >> ${BASE}$$.c +echo "#define RUSAGE_SELF 0" >> ${BASE}$$.c +echo "#endif /* RUSAGE_SELF */" >> ${BASE}$$.c +echo "main() { struct rusage ru; getrusage(RUSAGE_SELF, &ru); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DRUSAGE" +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lnsl +echo "extern int pmap_getport(); main() { pmap_getport(); }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lnsl 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lnsl" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + + +# check for -lsocket 
+echo "extern void* getservent(); main() { getservent(); }" > ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lsocket 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lsocket" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lrt (solaris) +echo "extern int nanosleep(); main() { nanosleep(); }" >${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lrt 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lrt" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for -lrpc (cygwin/Windows) +echo "extern int pmap_set(); main() { pmap_set(); }" >${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c -lrpc 1>${NULL} 2>${NULL} \ + && LDLIBS="${LDLIBS} -lrpc" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check for OSs that have S_IFFIFO instead of S_IFIFO +echo "#include <sys/stat.h>" > ${BASE}$$.c +echo "main() { return (S_IFIFO); }" >> ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + true; +else + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + echo "#include <sys/stat.h>" > ${BASE}$$.c + echo "main() { return (S_IFFIFO); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + || CFLAGS="${CFLAGS} -DS_IFIFO=S_IFFIFO" +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "main() { uint i = 0; return (i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint64 +HAVE_uint64=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <rpc/types.h>" >> 
${BASE}$$.c +echo "main() { uint64 i = 0; return (int)(i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint64=1" && HAVE_uint64=1; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have uint64_t +if [ ${HAVE_uint64} = 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "#include <sys/types.h>" >> ${BASE}$$.c + echo "main() { uint64_t i = 0; return (int)(i); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_uint64_t=1"; + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have int64 +HAVE_int64=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <rpc/types.h>" >> ${BASE}$$.c +echo "main() { int64 i = 0; return (int)(i); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_int64=1" && HAVE_int64=1; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have int64_t +if [ ${HAVE_int64} = 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "#include <sys/types.h>" >> ${BASE}$$.c + echo "main() { int64_t i = 0; return (int)(i); }" >> ${BASE}$$.c + ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_int64_t=1"; + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have drand48 and srand48 +HAVE_RANDOM=0 +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "main() { srand48(973); return (int)(1.0E9 * drand48()); }" >> ${BASE}$$.c +if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} -DHAVE_DRAND48" + HAVE_RANDOM=1 +fi +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +if [ ${HAVE_RANDOM} -eq 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main() { srand(973); return (10 * rand()) / RAND_MAX; }" >> ${BASE}$$.c + if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} 
-DHAVE_RAND" + HAVE_RANDOM=1 + fi + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +if [ ${HAVE_RANDOM} -eq 0 ]; then + echo "#include <stdlib.h>" > ${BASE}$$.c + echo "main() { srandom(973); return (10 * random()) / RAND_MAX; }" >> ${BASE}$$.c + if ${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL}; then + CFLAGS="${CFLAGS} -DHAVE_RANDOM" + HAVE_RANDOM=1 + fi + rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c +fi + +# check that we have sysmp +echo "#include <sys/types.h>" > ${BASE}$$.c +echo "#include <sys/sysmp.h>" >> ${BASE}$$.c +echo "main() { return (int)sysmp(MP_NPROCS); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_SYSMP=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have bindprocessor +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <unistd.h>" >> ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <sys/processor.h>" >> ${BASE}$$.c +echo "main() { return bindprocessor(BINDPROCESS, getpid(), 0); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_BINDPROCESSOR=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have processor_bind +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <sys/types.h>" >> ${BASE}$$.c +echo "#include <sys/processor.h>" >> ${BASE}$$.c +echo "#include <sys/procset.h>" >> ${BASE}$$.c +echo "main() { return processor(P_PID, P_MYPID, 0, NULL); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_BINDPROCESSOR=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + +# check that we have sched_setaffinity +echo "#include <stdlib.h>" > ${BASE}$$.c +echo "#include <unistd.h>" >> ${BASE}$$.c +echo "#include <sched.h>" >> ${BASE}$$.c +echo "main() { unsigned long mask = 1; return sched_setaffinity(0, sizeof(unsigned long), &mask); }" >> ${BASE}$$.c +${CC} ${CFLAGS} -o ${BASE}$$ ${BASE}$$.c 1>${NULL} 
2>${NULL} \ + && CFLAGS="${CFLAGS} -DHAVE_SCHED_SETAFFINITY=1"; +rm -f ${BASE}$$ ${BASE}$$.o ${BASE}$$.c + + +if [ ! -d ${BINDIR} ]; then mkdir -p ${BINDIR}; fi + +# now go ahead and build everything! +${MAKE} OS="${OS}" CC="${CC}" CFLAGS="${CFLAGS}" LDLIBS="${LDLIBS}" O="${BINDIR}" $* diff --git a/performance/lmbench3/scripts/compiler b/performance/lmbench3/scripts/compiler new file mode 100755 index 0000000..2fca921 --- /dev/null +++ b/performance/lmbench3/scripts/compiler @@ -0,0 +1,16 @@ +#!/bin/sh + +if [ "X$CC" != "X" ] && echo "$CC" | grep -q '`' +then + CC= +fi + +if [ X$CC = X ] +then CC=cc + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/gcc ] + then CC=gcc + fi + done +fi +echo $CC diff --git a/performance/lmbench3/scripts/config b/performance/lmbench3/scripts/config new file mode 100755 index 0000000..b58cb60 --- /dev/null +++ b/performance/lmbench3/scripts/config @@ -0,0 +1,7 @@ +#!/bin/sh + +UNAME=`uname -n 2>/dev/null` +if [ X$UNAME = X ] +then echo CONFIG +else echo CONFIG.$UNAME +fi diff --git a/performance/lmbench3/scripts/config-run b/performance/lmbench3/scripts/config-run new file mode 100755 index 0000000..40217d4 --- /dev/null +++ b/performance/lmbench3/scripts/config-run @@ -0,0 +1,783 @@ +#!/bin/sh + +# Configure parameters for lmbench. +# %I% %E% %@% + +OS=`../scripts/os` +L='=====================================================================' +echo $L; +cat<<EOF; + + L M B E N C H C ON F I G U R A T I O N + ---------------------------------------- + +You need to configure some parameters to lmbench. Once you have configured +these parameters, you may do multiple runs by saying + + "make rerun" + +in the src subdirectory. + +NOTICE: please do not have any other activity on the system if you can +help it. Things like the second hand on your xclock or X perfmeters +are not so good when benchmarking. In fact, X is not so good when +benchmarking. + +EOF + +# Figure out echo. 
+if [ `echo -n "foo" | wc -l` -eq 0 ] +then ECHON="-n"; ECHOC= +else ECHON= ; ECHOC='\c' +fi + +############################################################################ +# Timing granulairty, loop overhead, etc. +############################################################################ +echo $L; echo ""; +echo "Hang on, we are calculating your timing granularity." +../bin/$OS/msleep 250 +ENOUGH=`../bin/$OS/enough` +export ENOUGH +echo "OK, it looks like you can time stuff down to $ENOUGH usec resolution." +echo "" +echo "Hang on, we are calculating your timing overhead." +../bin/$OS/msleep 250 +TIMING_O=`../bin/$OS/timing_o` +export TIMING_O +echo "OK, it looks like your gettimeofday() costs $TIMING_O usecs." +echo "" +echo "Hang on, we are calculating your loop overhead." +../bin/$OS/msleep 250 +LOOP_O=`../bin/$OS/loop_o` +export LOOP_O +echo "OK, it looks like your benchmark loop costs $LOOP_O usecs." +echo "" +############################################################################ +# Multiple copies +############################################################################ +echo $L +cat<<EOF; + +If you are running on an MP machine and you want to try running +multiple copies of lmbench in parallel, you can specify how many here. + +Using this option will make the benchmark run 100x slower (sorry). + +NOTE: WARNING! This feature is experimental and many results are + known to be incorrect or random! 
+ +EOF +AGAIN=Y +while [ $AGAIN = Y ] +do echo $ECHON "MULTIPLE COPIES [default 1] $ECHOC" +# read SYNC_MAX + if [ "X$SYNC_MAX" != X ] + then case "$SYNC_MAX" in + [0-9]|[0-9][0-9]|[0-9][0-9][0-9]) + AGAIN=N + ;; + *) echo "Please enter a number between 1 and 999" + ;; + esac + else AGAIN=N + SYNC_MAX=1 + fi +done + +LMBENCH_SCHED=DEFAULT +AGAIN=Y +while [ $AGAIN = Y ] +do cat<<EOF +Options to control job placement +1) Allow scheduler to place jobs +2) Assign each benchmark process with any attendent child processes + to its own processor +3) Assign each benchmark process with any attendent child processes + to its own processor, except that it will be as far as possible + from other processes +4) Assign each benchmark and attendent processes to their own + processors +5) Assign each benchmark and attendent processes to their own + processors, except that they will be as far as possible from + each other and other processes +6) Custom placement: you assign each benchmark process with attendent + child processes to processors +7) Custom placement: you assign each benchmark and attendent + processes to processors + +Note: some benchmarks, such as bw_pipe, create attendent child +processes for each benchmark process. For example, bw_pipe +needs a second process to send data down the pipe to be read +by the benchmark process. If you have three copies of the +benchmark process running, then you actually have six processes; +three attendent child processes sending data down the pipes and +three benchmark processes reading data and doing the measurements. 
+ +EOF + echo $ECHON "Job placement selection: $ECHOC" +# read LMBENCH_SCHED +LMBENCH_SCHED=1 + AGAIN=N + case "$LMBENCH_SCHED" in + 1) LMBENCH_SCHED=DEFAULT;; + 2) LMBENCH_SCHED=BALANCED;; + 3) LMBENCH_SCHED=BALANCED_SPREAD;; + 4) LMBENCH_SCHED=UNIQUE;; + 5) LMBENCH_SCHED=UNIQUE_SPREAD;; + 6) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM $LMBENCH_SCHED" + ;; + 7) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM_SPREAD $LMBENCH_SCHED" + ;; + *) AGAIN=Y + ;; + esac +done + +############################################################################ +# Figure out memory size. +############################################################################ +if [ -r /proc/cpuinfo ] +then + PROCESSORS=`grep processor /proc/cpuinfo | wc -l` +fi + +if [ -r /proc/meminfo ] +then + TMP=`grep 'MemTotal:' /proc/meminfo | awk '{print $2}'` + if [ X$TMP != X ] + then MB=`echo $TMP / 1024 | bc 2>/dev/null` + if [ X$MB = X ] + then MB=`expr $TMP / 1024 2>/dev/null` + fi + fi + TMP=`grep 'Mem:' /proc/meminfo | awk '{print $2}'` + if [ X$MB = X -a X$TMP != X ] + then MB=`echo $TMP / 1048576 | bc 2>/dev/null` + if [ X$MB = X ] + then MB=`expr $TMP / 1048576 2>/dev/null` + fi + fi +fi +if [ X$MB = X ] +then $ECHON "Probing system for available memory: $ECHOC" + MB=`../bin/$OS/memsize 4096` +fi +TOTAL_MEM=$MB +MB=`echo \( $MB \* 7 \) / 10 | bc 2>/dev/null` +if [ X$MB = X ] +then MB=`expr $TOTAL_MEM \* 7` + MB=`expr $MB / 10` +fi + +echo $L +cat<<EOF; + +Several benchmarks operate on a range of memory. This memory should be +sized such that it is at least 4 times as big as the external cache[s] +on your system. It should be no more than 80% of your physical memory. + +The bigger the range, the more accurate the results, but larger sizes +take somewhat longer to run the benchmark. 
+ +EOF +echo $ECHON "MB [default $MB] $ECHOC" +#read TMP +if [ X$TMP != X ] +then MB=$TMP +fi +# Certain machines tend to barf when you try and bcopy 8MB. +# Figure out how much we can use. +echo "Checking to see if you have $MB MB; please wait for a moment..." +MB=`../bin/$OS/memsize $MB` +MB=`../bin/$OS/memsize $MB` +MB=`../bin/$OS/memsize $MB` +if [ `expr $SYNC_MAX \* $MB` -gt `expr $TOTAL_MEM` ] +then + MB=`expr $TOTAL_MEM / $SYNC_MAX` + MB=`expr $MB / 2` +fi +if [ $MB -lt 8 ] +then echo $0 aborted: Not enough memory, only ${MB}MB available. + exit 1 +fi +if [ $MB -lt 16 ] +then echo Warning: you have only ${MB}MB available memory. + echo Some benchmark results will be less meaningful. +fi + +echo "Hang on, we are calculating your cache line size." +../bin/$OS/msleep 250 +LINE_SIZE=`../bin/$OS/line -M ${MB}M` +export LINE_SIZE +echo "OK, it looks like your cache line is $LINE_SIZE bytes." +echo "" + +############################################################################ +# Benchmarking subsets +############################################################################ +echo $L +cat<<EOF; + +lmbench measures a wide variety of system performance, and the full suite +of benchmarks can take a long time on some platforms. Consequently, we +offer the capability to run only predefined subsets of benchmarks, one +for operating system specific benchmarks and one for hardware specific +benchmarks. We also offer the option of running only selected benchmarks +which is useful during operating system development. + +Please remember that if you intend to publish the results you either need +to do a full run or one of the predefined OS or hardware subsets. 
+ +EOF + +echo $ECHON "SUBSET (ALL|HARWARE|OS|DEVELOPMENT) [default all] $ECHOC" +#read subset +subset=O +BENCHMARK_HARDWARE=NO +BENCHMARK_OS=NO +BENCHMARK_DEVELOPMENT=NO +case "$subset" in + [hH]*) BENCHMARK_HARDWARE=YES;; + [oO]*) BENCHMARK_OS=YES;; + [dD]*) BENCHMARK_DEVELOPMENT=YES;; + *) BENCHMARK_HARDWARE=YES; + BENCHMARK_OS=YES;; +esac + +if [ X$BENCHMARK_DEVELOPMENT = XYES ]; then + echo $L + + echo $ECHON "SYSCALL [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_SYSCALL=NO;; + *) BENCHMARK_SYSCALL=YES;; + esac + + echo $ECHON "SELECT [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_SELECT=NO;; + *) BENCHMARK_SELECT=YES;; + esac + + echo $ECHON "PROCESS CREATION [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PROC=NO;; + *) BENCHMARK_PROC=YES;; + esac + + echo $ECHON "PAGEFAULT [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PAGEFAULT=NO;; + *) BENCHMARK_PAGEFAULT=YES;; + esac + + echo $ECHON "FILE [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_FILE=NO;; + *) BENCHMARK_FILE=YES;; + esac + + echo $ECHON "MMAP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_MMAP=NO;; + *) BENCHMARK_MMAP=YES;; + esac + + echo $ECHON "CONTEXT SWITCH [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_CTX=NO;; + *) BENCHMARK_CTX=YES;; + esac + + echo $ECHON "PIPE [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_PIPE=NO;; + *) BENCHMARK_PIPE=YES;; + esac + + echo $ECHON "UNIX socket [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_UNIX=NO;; + *) BENCHMARK_UNIX=YES;; + esac + + echo $ECHON "UDP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_UDP=NO;; + *) BENCHMARK_UDP=YES;; + esac + + echo $ECHON "TCP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_TCP=NO;; + *) BENCHMARK_TCP=YES;; + esac + + echo $ECHON "TCP CONNECT 
[default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_CONNECT=NO;; + *) BENCHMARK_CONNECT=YES;; + esac + + echo $ECHON "RPC [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_RPC=NO;; + *) BENCHMARK_RPC=YES;; + esac + + echo $ECHON "HTTP [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_HTTP=NO;; + *) BENCHMARK_HTTP=YES;; + esac + + echo $ECHON "BCOPY [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_BCOPY=NO;; + *) BENCHMARK_BCOPY=YES;; + esac + + echo $ECHON "MEMORY HIERARCHY [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_MEM=NO;; + *) BENCHMARK_MEM=YES;; + esac + + echo $ECHON "CPU OPERATIONS [default yes] $ECHOC" + read bench + case "$bench" in + [nN]*) BENCHMARK_OPS=NO;; + *) BENCHMARK_OPS=YES;; + esac +fi + +############################################################################ +# Memory strides for lat_mem +############################################################################ +FASTMEM=NO +if [ "$BENCHMARK_HARDWARE" = "YES" ]; then + echo $L + cat<<EOF; + +This benchmark measures, by default, memory latency for a number of +different strides. That can take a long time and is most useful if you +are trying to figure out your cache line size or if your cache line size +is greater than 128 bytes. + +If you are planning on sending in these results, please don't do a fast +run. + +Answering yes means that we measure memory latency with a 128 byte stride. + +EOF + + echo $ECHON "FASTMEM [default no] $ECHOC" + read fast + case "$fast" in + [yY]*) FASTMEM=YES;; + *) FASTMEM=NO;; + esac +fi + +############################################################################ +# File system latency +############################################################################ +echo $L +cat<<EOF; + +This benchmark measures, by default, file system latency. That can +take a long time on systems with old style file systems (i.e., UFS, +FFS, etc.). 
Linux' ext2fs and Sun's tmpfs are fast enough that this +test is not painful. + +If you are planning on sending in these results, please don't do a fast +run. + +If you want to skip the file system latency tests, answer "yes" below. + +EOF + +echo $ECHON "SLOWFS [default no] $ECHOC" +#read slow +slow=n +case "$slow" in + [yY]*) SLOWFS=YES;; + *) SLOWFS=NO;; +esac + +############################################################################ +# Disk bandwidth/seek times +############################################################################ +if [ $SYNC_MAX -gt 1 -o "${BENCHMARK_HARDWARE}" != "YES" ]; then + # parallel benchmarking is incompatible with disk tests + DISK_DESC="" + DISKS="" +else + echo $L + cat<<EOF; + +This benchmark can measure disk zone bandwidths and seek times. These can +be turned into whizzy graphs that pretty much tell you everything you might +need to know about the performance of your disk. + +This takes a while and requires read access to a disk drive. +Write is not measured, see disk.c to see how if you want to do so. + +If you want to skip the disk tests, hit return below. + +If you want to include disk tests, then specify the path to the disk +device, such as /dev/sda. For each disk that is readable, you'll be +prompted for a one line description of the drive, i.e., + + Iomega IDE ZIP +or + HP C3725S 2GB on 10MB/sec NCR SCSI bus + +EOF + + echo $ECHON "DISKS [default none] $ECHOC" + read disks + if [ X"$disks" != X ] + then + for i in $disks + do if [ -r $i ] + then ../bin/$OS/flushdisk $i + if [ $? -eq 1 ] + then echo "Must be root to run disk benchmarks." + echo "Root is needed to flush the buffer cache" + exit 1 + fi + echo $ECHON "$i is a $ECHOC" + read x + DISK_DESC="$DISK_DESC[${i}:${x}] " + DISKS="$DISKS${i} " + else echo "Can't read $i, skipping it." 
+ fi + done + fi +fi + +############################################################################ +# Remote networking +############################################################################ +if [ $SYNC_MAX -gt 1 ]; then + # remote networking is incompatible with parallel benchmarking + REMOTE="" +else + echo $L + + RSH=rsh + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/remsh ] + then RSH=remsh + fi + done + RCP=rcp + + cat<<EOF; + +If you are running on an idle network and there are other, identically +configured systems, on the same wire (no gateway between you and them), +and you have rsh access to them, then you should run the network part +of the benchmarks to them. Please specify any such systems as a space +separated list such as: ether-host fddi-host hippi-host. + +EOF + echo $ECHON "REMOTE [default none] $ECHOC" +# read REMOTE + if [ "X$REMOTE" != X ] + then cat<<EOF; + +Thanks for doing remote testing, that is a hard thing to get. In +order to run a server on the remote system, we need a remote shell +to be enabled (ideally without a password) from this host to $REMOTE. +The original remote shell is rsh, but the use of a secure remote shell +like ssh is increasingly common. We need the name of BOTH the shell +itself and the associated copy tool (e.g. rcp vs scp) to be entered. + +EOF + echo $ECHON "RSH [default $RSH] $ECHOC" + read rsh + if [ -n "$rsh" ] + then RSH=$rsh + fi + echo $ECHON "RCP [default $RCP] $ECHOC" + read rcp + if [ -n "$rsh" ] + then RCP=$rcp + fi + + cat<<EOF; + +Could you do me one more favor and tell me the networking you think +will be used to get to each of the remote hosts. By networking I +mean one of the following (or whatever you use if you use something +else): + +ethernet aka 10baseT, thinnet, thicknet, etc +ethernet-100 aka 100baseT, 100VG +fddi aka cddi +hippi +others? + +Please type it just like the above if you can, it makes parsing easier. 
+ +EOF + + + for r in $REMOTE + do echo $ECHON "Network type for $r: $ECHOC" + read n + X=`$RSH $r echo foo` + if [ X$X = Xfoo ] + then echo Remote access to $r worked, thanks. + else echo Remote access to $r did not work, please check and retry, + exit 1 + fi + NETWORKS="${NETWORKS}[ $r:$n ]" + done + fi +fi + +############################################################################ +# Processor speed +############################################################################ +echo $L +echo "" +echo "Calculating mhz, please wait for a moment..." +MHZ=`../bin/$OS/mhz` +cat<<EOF +I think your CPU mhz is + + $MHZ + +but I am frequently wrong. If that is the wrong Mhz, type in your +best guess as to your processor speed. It doesn't have to be exact, +but if you know it is around 800, say 800. + +Please note that some processors, such as the P4, have a core which +is double-clocked, so on those processors the reported clock speed +will be roughly double the advertised clock rate. For example, a +1.8GHz P4 may be reported as a 3592MHz processor. + +EOF +echo $ECHON "Processor mhz [default $MHZ] $ECHOC" +#read mhz +if [ -n "$mhz" ] +then MHZ=$mhz +fi + + +############################################################################ +# /usr/tmp? +############################################################################ +echo $L +AGAIN=Y +while [ $AGAIN = Y ] +do + cat<<EOF; + +We need a place to store a $MB Mbyte file as well as create and delete a +large number of small files. We default to /usr/tmp. If /usr/tmp is a +memory resident file system (i.e., tmpfs), pick a different place. +Please specify a directory that has enough space and is a local file +system. 
+ +EOF + DEFAULTFSDIR=/usr/tmp + for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then DEFAULTFSDIR=$t + break + fi + done + echo $ECHON "FSDIR [default $DEFAULTFSDIR] $ECHOC" + #read FSDIR + if [ X$FSDIR = X ] + then FSDIR=$DEFAULTFSDIR + else mkdir -p $FSDIR 2>/dev/null + fi + if [ -d $FSDIR -a -w $FSDIR ] + then AGAIN=N + FILE=$FSDIR/XXX + else echo $FSDIR is not a directory or is not writable + fi +done + +############################################################################ +# status output? +############################################################################ +echo $L +cat<<EOF; + +lmbench outputs status information as it runs various benchmarks. +By default this output is sent to /dev/tty, but you may redirect +it to any file you wish (such as /dev/null...). + +EOF + +echo $ECHON "Status output file [default /dev/tty] $ECHOC" +#read OUTPUT +if [ "X$OUTPUT" = X ] +then OUTPUT=/dev/tty; +fi + +############################################################################ +# Submit results? +############################################################################ +echo $L +cat<<EOF; + +There is a database of benchmark results that is shipped with new +releases of lmbench. Your results can be included in the database +if you wish. The more results the better, especially if they include +remote networking. If your results are interesting, i.e., for a new +fast box, they may be made available on the lmbench web page, which is + + http://www.bitmover.com/lmbench + +EOF + +echo $ECHON "Mail results [default yes] $ECHOC" +#read MAIL +MAIL=n +case $MAIL in + [Nn]*) MAIL=no + echo OK, no results mailed. + ;; + *) MAIL=yes + ;; +esac + +INFO=`../scripts/info` +if [ $MAIL = yes ] +then if [ ! 
-f ../bin/$OS/$INFO ] + then cp ../scripts/info-template ../bin/$OS/$INFO + chmod +w ../bin/$OS/$INFO + REUSE=no + else + REUSE=view + while [ $REUSE = view ] + do echo "" + echo $ECHON \ +"Reuse previous description [default yes, other options: no|view] $ECHOC" + read REUSE + case $REUSE in + [Nn]*) REUSE=no + ;; + [Vv]*) REUSE=view + echo $L + more ../bin/$OS/$INFO + echo $L + ;; + *) REUSE=yes + ;; + esac + done + fi + + if [ $REUSE = no ] + then EDITOR=vi + echo $L + cat<<EOF; + +Please tell us about your machine. There is a form we would like you +to fill out that we will make available with the results. If you would +prefer to use a different editor, tell us the editor at the prompt. + +If you want to skip filling out this form (please don't) then answer +"none" at the prompt. + +EOF + echo $ECHON "Editor [default $EDITOR] $ECHOC" + read TMP + if [ X$TMP != X ] + then EDITOR=$TMP + fi + if [ X$EDITOR != "none" ] + then $EDITOR ../bin/$OS/`../scripts/info` + fi + fi +fi + +echo $L +echo "" +echo "Confguration done, thanks." +cat <<EOF + +There is a mailing list for discussing lmbench hosted at BitMover. +Send mail to majordomo@xxxxxxxxxxxx to join the list. 
+ +EOF + +VERSION=`../scripts/version` + +C=../bin/$OS/`../scripts/config` +echo DISKS=\"$DISKS\" > $C +echo DISK_DESC=\"$DISK_DESC\" >> $C +echo OUTPUT=$OUTPUT >> $C +echo ENOUGH=$ENOUGH >> $C +echo FASTMEM=\"$FASTMEM\" >> $C +echo FILE=$FILE >> $C +echo FSDIR=$FSDIR >> $C +echo INFO=$INFO >> $C +echo LINE_SIZE=$LINE_SIZE >> $C +echo LOOP_O=$LOOP_O >> $C +echo MAIL=$MAIL >> $C +echo TOTAL_MEM=$TOTAL_MEM >> $C +echo MB=$MB >> $C +echo MHZ=\"$MHZ\" >> $C +echo MOTHERBOARD=\"$MOTHERBOARD\" >> $C +echo NETWORKS=\"$NETWORKS\" >> $C +echo OS=\"$OS\" >> $C +echo PROCESSORS=\"$PROCESSORS\" >> $C +echo REMOTE=\"$REMOTE\" >> $C +echo SLOWFS=\"$SLOWFS\" >> $C +echo SYNC_MAX=\"$SYNC_MAX\" >> $C +echo LMBENCH_SCHED=\"$LMBENCH_SCHED\" >> $C +echo TIMING_O=$TIMING_O >> $C +echo RSH=$RSH >> $C +echo RCP=$RCP >> $C +echo VERSION=$VERSION >> $C +echo BENCHMARK_HARDWARE=$BENCHMARK_HARDWARE >> $C +echo BENCHMARK_OS=$BENCHMARK_OS >> $C +echo BENCHMARK_SYSCALL=$BENCHMARK_SYSCALL >> $C +echo BENCHMARK_SELECT=$BENCHMARK_SELECT >> $C +echo BENCHMARK_PROC=$BENCHMARK_PROC >> $C +echo BENCHMARK_CTX=$BENCHMARK_CTX >> $C +echo BENCHMARK_PAGEFAULT=$BENCHMARK_PAGEFAULT >> $C +echo BENCHMARK_FILE=$BENCHMARK_FILE >> $C +echo BENCHMARK_MMAP=$BENCHMARK_MMAP >> $C +echo BENCHMARK_PIPE=$BENCHMARK_PIPE >> $C +echo BENCHMARK_UNIX=$BENCHMARK_UNIX >> $C +echo BENCHMARK_UDP=$BENCHMARK_UDP >> $C +echo BENCHMARK_TCP=$BENCHMARK_TCP >> $C +echo BENCHMARK_CONNECT=$BENCHMARK_CONNECT >> $C +echo BENCHMARK_RPC=$BENCHMARK_RPC >> $C +echo BENCHMARK_HTTP=$BENCHMARK_HTTP >> $C +echo BENCHMARK_BCOPY=$BENCHMARK_BCOPY >> $C +echo BENCHMARK_MEM=$BENCHMARK_MEM >> $C +echo BENCHMARK_OPS=$BENCHMARK_OPS >> $C + +exit 0 diff --git a/performance/lmbench3/scripts/config-scaling b/performance/lmbench3/scripts/config-scaling new file mode 100755 index 0000000..12e0f02 --- /dev/null +++ b/performance/lmbench3/scripts/config-scaling @@ -0,0 +1,160 @@ +#!/bin/sh + +# config-scaling - reconfigure just the scaling parameter SYNC_MAX +# 
+# Hacked by Carl Staelin (staelin@xxxxxxxxxx). +# Copyright (c) 2002 Carl Staelin. GPLed software. +# $Id$ + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:../../scripts:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ ! -f $1 ]; then exit 1; fi + +. $1 +echo Using config in $1 + +OLD_SYNC_MAX=$SYNC_MAX + +############################################################################ +# Multiple copies +############################################################################ +echo $L +cat<<EOF; + +If you are running on an MP machine and you want to try running +multiple copies of lmbench in parallel, you can specify how many here. + +Using this option will make the benchmark run 100x slower (sorry). + +NOTE: WARNING! This feature is experimental and many results are + known to be incorrect or random! + +EOF +AGAIN=Y +while [ $AGAIN = Y ] +do echo $ECHON "MULTIPLE COPIES [default 1] $ECHOC" + read SYNC_MAX + if [ "X$SYNC_MAX" != X ] + then case "$SYNC_MAX" in + [0-9]|[0-9][0-9]|[0-9][0-9][0-9]) + AGAIN=N + ;; + *) echo "Please enter a number between 1 and 999" + ;; + esac + else AGAIN=N + SYNC_MAX=1 + fi +done + +if [ "X$LMBENCH_SCHED" = "X" ] +then + LMBENCH_SCHED=DEFAULT + AGAIN=Y + while [ "$AGAIN" = "Y" ] + do cat<<EOF +Options to control job placement +1) Allow scheduler to place jobs +2) Assign each benchmark process with any attendent child processes + to its own processor +3) Assign each benchmark process with any attendent child processes + to its own processor, except that it will be as far as possible + from other processes +4) Assign each benchmark and attendent processes to their own + processors +5) Assign each benchmark and attendent processes to their own + processors, except that they will be as far as possible from + each other and other processes +6) Custom placement: you assign each benchmark process with attendent + child processes to processors +7) Custom placement: you assign each benchmark and attendent + processes to 
processors + +Note: some benchmarks, such as bw_pipe, create attendent child +processes for each benchmark process. For example, bw_pipe +needs a second process to send data down the pipe to be read +by the benchmark process. If you have three copies of the +benchmark process running, then you actually have six processes; +three attendent child processes sending data down the pipes and +three benchmark processes reading data and doing the measurements. + +EOF + echo $ECHON "Job placement selection: $ECHOC" + read LMBENCH_SCHED + AGAIN=N + case "$LMBENCH_SCHED" in + 1) LMBENCH_SCHED=DEFAULT;; + 2) LMBENCH_SCHED=BALANCED;; + 3) LMBENCH_SCHED=BALANCED_SPREAD;; + 4) LMBENCH_SCHED=UNIQUE;; + 5) LMBENCH_SCHED=UNIQUE_SPREAD;; + 6) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM $LMBENCH_SCHED" + ;; + 7) echo $ECHON "Please enter a space-separated list of CPU ids: $ECHOC" + read LMBENCH_SCHED + LMBENCH_SCHED="CUSTOM_SPREAD $LMBENCH_SCHED" + ;; + *) AGAIN=Y + ;; + esac + done +fi + +if [ `expr $SYNC_MAX \* $MB` -gt `expr $TOTAL_MEM / 2` ] +then + MB=`expr $TOTAL_MEM / $SYNC_MAX` + MB=`expr $MB / 2` +fi + +C=$1 +echo DISKS=\"$DISKS\" > $C +echo DISK_DESC=\"$DISK_DESC\" >> $C +echo OUTPUT=$OUTPUT >> $C +echo ENOUGH=$ENOUGH >> $C +echo FASTMEM=\"$FASTMEM\" >> $C +echo FILE=$FILE >> $C +echo FSDIR=$FSDIR >> $C +echo INFO=$INFO >> $C +echo LINE_SIZE=$LINE_SIZE >> $C +echo LOOP_O=$LOOP_O >> $C +echo MAIL=$MAIL >> $C +echo TOTAL_MEM=$TOTAL_MEM >> $C +echo MB=$MB >> $C +echo MHZ=\"$MHZ\" >> $C +echo MOTHERBOARD=\"$MOTHERBOARD\" >> $C +echo NETWORKS=\"$NETWORKS\" >> $C +echo OS=\"$OS\" >> $C +echo PROCESSORS=\"$PROCESSORS\" >> $C +echo REMOTE=\"$REMOTE\" >> $C +echo SLOWFS=\"$SLOWFS\" >> $C +echo SYNC_MAX=\"$SYNC_MAX\" >> $C +echo LMBENCH_SCHED=\"$LMBENCH_SCHED\" >> $C +echo TIMING_O=$TIMING_O >> $C +echo RSH=$RSH >> $C +echo RCP=$RCP >> $C +echo VERSION=$VERSION >> $C +echo BENCHMARK_HARDWARE=$BENCHMARK_HARDWARE >> 
$C +echo BENCHMARK_OS=$BENCHMARK_OS >> $C +echo BENCHMARK_SYSCALL=$BENCHMARK_SYSCALL >> $C +echo BENCHMARK_SELECT=$BENCHMARK_SELECT >> $C +echo BENCHMARK_PROC=$BENCHMARK_PROC >> $C +echo BENCHMARK_CTX=$BENCHMARK_CTX >> $C +echo BENCHMARK_PAGEFAULT=$BENCHMARK_PAGEFAULT >> $C +echo BENCHMARK_FILE=$BENCHMARK_FILE >> $C +echo BENCHMARK_MMAP=$BENCHMARK_MMAP >> $C +echo BENCHMARK_PIPE=$BENCHMARK_PIPE >> $C +echo BENCHMARK_UNIX=$BENCHMARK_UNIX >> $C +echo BENCHMARK_UDP=$BENCHMARK_UDP >> $C +echo BENCHMARK_TCP=$BENCHMARK_TCP >> $C +echo BENCHMARK_CONNECT=$BENCHMARK_CONNECT >> $C +echo BENCHMARK_RPC=$BENCHMARK_RPC >> $C +echo BENCHMARK_HTTP=$BENCHMARK_HTTP >> $C +echo BENCHMARK_BCOPY=$BENCHMARK_BCOPY >> $C +echo BENCHMARK_MEM=$BENCHMARK_MEM >> $C +echo BENCHMARK_OPS=$BENCHMARK_OPS >> $C + +exit 0 diff --git a/performance/lmbench3/scripts/depend b/performance/lmbench3/scripts/depend new file mode 100755 index 0000000..30452ec --- /dev/null +++ b/performance/lmbench3/scripts/depend @@ -0,0 +1,28 @@ + +# Figure out dependencies for lmbench src. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: depend 1.4 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +open(M, "Makefile"); +while(<M>) { + push(@Makefile, $_); + last if /^..MAKEDEPEND/; +} +close(M); +open(G, "gcc -MM *.c | grep -v mhz.c | grep -v lat_ctx.c|"); +while (<G>) { + chop; + split(/:/); + $_[0] =~ s/\.o\s*$//; + push(@Makefile, "\$O/$_[0]: $_[1] \$O/lmbench.a\n"); + push(@Makefile, "\t\$(COMPILE) -o \$O/$_[0] $_[0].c \$O/lmbench.a \$(LDLIBS)\n\n"); +} +system "mv Makefile Makefile.old"; +open(M, ">Makefile"); +print M @Makefile; +close(M); +exit 0; diff --git a/performance/lmbench3/scripts/do_ctx b/performance/lmbench3/scripts/do_ctx new file mode 100755 index 0000000..002a6c2 --- /dev/null +++ b/performance/lmbench3/scripts/do_ctx @@ -0,0 +1,35 @@ +#!/bin/sh + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ X$MB = X ] +then MB=8 +fi +AVAILKB=`expr $MB \* 1024` + +# Figure out how big we can go for stuff that wants to use +# all and half of memory. +HALF="512 1k 2k 4k 8k 16k 32k 64k 128k 256k 512k 1m" +ALL="$HALF 2m" +i=4 +while [ $i -le $MB ] +do + ALL="$ALL ${i}m" + h=`expr $i / 2` + HALF="$HALF ${h}m" + i=`expr $i \* 2` +done + +msleep 250 +if [ "X$CTX" = X ] +then CTX="0 4 8 16 32 64" +fi +if [ "X$N" = X ] +then N="2 4 8 16 24 32 64 96" +fi +for size in $CTX +do lat_ctx -s $size $N +done +exit 0 diff --git a/performance/lmbench3/scripts/getbg b/performance/lmbench3/scripts/getbg new file mode 100755 index 0000000..24e49ad --- /dev/null +++ b/performance/lmbench3/scripts/getbg @@ -0,0 +1,806 @@ + +# Extract bargraph data from lmbench results. +# Usage: getbg file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: getbg 1.18 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Sws $0 "$@"' + if 0; + +@bw_file = @file = @lat_ctx32_8 = @lat_ctx32 = @lat_ctx8 = @lat_ctx = +@lat_shproc = @lat_simpleproc = @lat_nullproc = +@lat_rpc_tcp_local = @lat_rpc_udp_local = @lat_tcp_local = @lat_udp_local = +@lat_pipe = @lat_disk = @mhz = @lat_fs_delete = @lat_fs_create = +@lat_mappings = @lat_pagefault = @lat_connect = @lat_signal = @lat_sigaction = +@lat_nullsys = @lat_mem = @lat_l2 = @lat_l1 = (); +$nosort = $v = $paper = $slide = 0 if 0; +$sortN = 0; +$n = 0; +foreach $file (@ARGV) { + warn "$0: doing $file\n" if $v; + open(FD, $file) || die "$0: can't open $file"; + $file =~ s|/|-|; + $file =~ s/\.\d+//; + push(@file, $file); + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + push(@uname, "@_"); + } + if (/Mhz/) { + @_ = split; + push(@misc_mhz, $_[0]); + } + if (/^Null syscall:/) { + @_ = split; + push(@lat_nullsys, $_[2]); + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_sigaction, $_[3]); + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_signal, $_[3]); + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + } + if (/UDP latency using localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + } + if (/TCP latency using localhost/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + } + if (/RPC.udp latency using localhost/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + } + if (/RPC.tcp latency using localhost/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + } + if (/TCP\/IP connection cost to localhost/) { + @_ = split; + push(@lat_connect, $_[5]); + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + } + if 
(/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + } + if (/size=0 ovr=/) { + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx, -1); + } + next unless /^2/; + @_ = split; + push(@lat_ctx, $_[1]); + last; + } + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx, -1); + } + next unless /^8/; + @_ = split; + push(@lat_ctx8, $_[1]); + last; + } + } + if (/size=32 ovr=/) { + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx32, -1); + } + next unless /^2/; + @_ = split; + push(@lat_ctx32, $_[1]); + last; + } + while (<FD>) { + # Make sure we break out if no data here. + if (!/^[1-9]+\s/) { + warn "$file: No ctx found\n"; + push(@lat_ctx32_8, -1); + } + next unless /^8/; + @_ = split; + push(@lat_ctx32_8, $_[1]); + last; + } + } + if (/^Pipe bandwidth/) { + @_ = split; + push(@bw_pipe, $_[2]); + } + if (/^Socket bandwidth using localhost/) { + @_ = split; + push(@bw_tcp_local, $_[4]); + } + if (/^Disk .* latency:/) { + @_ = split; + push(@lat_disk, $_[3]); + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, 
$value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^0k\s/) { + @_ = split; + push(@lat_fs_create, int(1000000/$_[2])); + push(@lat_fs_delete, int(1000000/$_[3])); + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + @_ = split; + push(@lat_l1, $_[1]); + } elsif (/^0.12500\s/) { + @_ = split; + push(@lat_l2, $_[1]); + } elsif (/^[45678].00000\s/) { + @_ = split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + } + foreach $array ( + 'misc_mhz', 'lat_nullsys', 'lat_pipe', 'lat_udp_local', + 'lat_tcp_local', 'lat_rpc_udp_local', 'lat_connect', + 'lat_rpc_tcp_local', 'lat_nullproc', 'lat_simpleproc', + 'lat_ctx', 'lat_ctx8', 'bw_pipe', 'bw_tcp_local', + 'bw_file', 'lat_mappings', 'bw_reread', 'bw_mmap', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_mem_rdsum', + 'bw_mem_wr', 'lat_l1', 'lat_l2', 'lat_mem', 'lat_disk', + ) { + $last = eval '$#' . $array; + if ($last != $n) { + warn "No data for $array in $file\n"; + eval 'push(@' . $array . 
', -1);'; + } + } + $n++; +} + +if ($paper) { + &tbl("lat_nullsys", "usecs", "system call"); + &tbl2("lat_signal", "lat_sigaction", "lat_signal", "usecs", + "signal", "sigaction", "sig handler"); + #&tbl("lat_nullproc", "msecs", "Process fork/exit time in milliseconds"); + #&tbl("lat_simpleproc", "msecs", "Simple process create time in milliseconds"); + #&tbl("lat_shproc", "msecs", "Process creates via /bin/sh time in milliseconds"); + #&tbl2("lat_proc", "lat_simpleproc", "lat_shproc", "usecs", + # "Process create time in milliseconds", "exec(2)", "/bin/sh -c"); + &procs("lat_allproc", "lat_nullproc", "lat_simpleproc", "lat_shproc", + "msecs"); + &ctx; + &tbl("lat_pipe", "usecs", "Pipe latency"); + &tbl("lat_connect", "usecs", "TCP connection"); + &tbl2("lat_udp", "lat_udp_local", "lat_rpc_udp_local", "usecs", + "UDP latency in \\(*mseconds", "UDP", "RPC/UDP"); + &tbl2("lat_tcp", "lat_tcp_local", "lat_rpc_tcp_local", "usecs", + "TCP latency in \\(*mseconds", "TCP", "RPC/TCP"); + &tbl("lat_mappings", "usecs", "Memory mapping latency in \\(*mseconds"); + &tbl("lat_pagefault", "usecs", "Pagefault latency in \\(*mseconds"); + &tbl2("lat_fs", "lat_fs_create", "lat_fs_delete", "usecs", + "File latency in milliseconds", "Create", "Delete"); + &tbl("lat_disk", "usecs", "Disk latency"); + + &tbl("misc_mhz", "mhz", "Processor clock rate"); + &tbl("bw_pipe", "MB", "Pipe bandwidth in MB / second"); + &tbl("bw_tcp_local", "MB", "Local TCP socket bandwidth in MB / second"); + &ipc; + &tbl("bw_file", "MB", "File write bandwidth in MB / second"); + &tbl("bw_reread", "MB", "(Re)Read in MB / second"); + &tbl("bw_mmap", "MB", "(Re)Read via mmap bandwidth in MB / second"); + &read; + &tbl2("bw_bcopy", "bw_bcopy_unrolled", "bw_bcopy_libc", "MB", + "Bcopy bandwidth in MB / second", "Unrolled", "Libc"); + &tbl("bw_mem_rdsum", "MB", "Memory read & sum bandwidth in MB / second"); + &tbl("bw_mem_wr", "MB", "Memory write bandwidth in MB / second"); + &mem; + +} else { + &bg("lat_nullsys", 
"usecs", "Number of null system calls per second"); + &bg("lat_signal", "usecs", "Number of signal handlers per second"); + &bg("lat_nullproc", "usecs", "Number of process forks/exits per second"); + &bg("lat_simpleproc", "usecs", "Number of simple process creates per second"); + &bg("lat_shproc", "usecs", "Number of simple process creates via /bin/sh per second"); + &bg("lat_ctx", "usecs", "Number of context switches per second, 2 small processes"); + &bg("lat_ctx8", "usecs", "Number of context switches per second, 8 small processes"); + + &bg("lat_pipe", "usecs", "Number of pipe transactions per second"); + &bg("lat_connect", "usecs", "Number of local TCP socket connections per second"); + &bg("lat_tcp_local", "usecs", "Number of local TCP socket transactions per second"); + &bg("lat_udp_local", "usecs", "Number of local UDP socket transactions per second"); + &bg("lat_rpc_udp_local", "usecs", + "Number of local RPC/UDP socket transactions per second"); + &bg("lat_rpc_tcp_local", "usecs", + "Number of local RPC/TCP socket transactions per second"); + &bg("lat_mappings", "usecs", "Number of memory mappings per second"); + &bg("lat_pagefault", "usecs", "Number of pagefaults per second"); + &bg("lat_fs_create", "usecs", "Number of file creates per second"); + + &bg("misc_mhz", "mhz", "Processor clock rate"); + &bg("bw_pipe", "MB", "Pipe bandwidth in MB / second"); + &bg("bw_tcp_local", "MB", "Local TCP socket bandwidth in MB / second"); + &bg("bw_file", "MB", "File write bandwidth in MB / second"); + &bg("bw_reread", "MB", "(Re)Read in MB / second"); + &bg("bw_mmap", "MB", "(Re)Read via mmap bandwidth in MB / second"); + &bg("bw_bcopy_libc", "MB", "Libc bcopy bandwidth in MB / second"); + &bg("bw_bcopy_unrolled", "MB", "Unrolled bcopy bandwidth in MB / second"); + &bg("bw_mem_rdsum", "MB", "Memory read & sum bandwidth in MB / second"); + &bg("bw_mem_wr", "MB", "Memory write bandwidth in MB / second"); +} + +exit 0; + +# Input looks like +# "benchmark name +# size 
value +# .... +# <blank line> +# +# Return the biggest vvalue before the blank line. +sub getbiggest +{ + local($msg) = @_; + + undef $save; + $value = 0; + while (<FD>) { + last if /^\s*$/; + $save = $_ if /^\d\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +sub bigger +{ + local($v1, $v2) = ($a, $b); + + if ($sortN > 0) { + $v1 = (split(/\t/, $v1))[$sortN]; + $v2 = (split(/\t/, $v2))[$sortN]; + } else { + $v1 =~ s/.*\t//; + chop($v1); + $v2 =~ s/.*\t//; + chop($v2); + } + return ($v1 < $v2); +} + +sub smaller +{ + local($v1, $v2) = ($a, $b); + + if ($sortN > 0) { + $v1 = (split(/\t/, $v1))[$sortN]; + $v2 = (split(/\t/, $v2))[$sortN]; + } else { + $v1 =~ s/.*\t//; + chop($v1); + $v2 =~ s/.*\t//; + chop($v2); + } + $v1 =~ s/[^0-9]+//; + $v2 =~ s/[^0-9]+//; + return ($v1 > $v2); +} + +sub tbl +{ + local($graph, $units, $title) = @_; + local(@values, @tmp, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\ncenter expand doublebox;\nl r.\nSystem\t$title\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $XXX = '$value = $'.$graph.'[$i];'; + eval '$value = $'.$graph.'[$i];'; + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + next if (!defined $value || $value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + push(@values, "$_\t$value\n"); + } + @values = sort smaller @values unless ($nosort); + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub tbl2 +{ + local($graph, $a, $b, $units, $title, $atitle, $btitle) = @_; + local(@values, @tmp, $line, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c\nl r r.\n"; + print FD "System\t$atitle\t\\fB$btitle\\fP\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + eval '$value = $'.$a.'[$i];'; + next if (!defined $value || $value <= 0); + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + eval '$value = $'.$b.'[$i];'; + $value = sprintf("%.1f", $value / 1000) if ($units eq "msecs"); + $value = sprintf("%.1f", $value) if ($units eq "MB"); + next if (!defined $value || $value <= 0); + $line .= "$value\n"; + push(@values, $line); + } + unless ($nosort || $units eq "mhz") { + if ($units eq "MB") { + @values = sort bigger @values; + } else { + @values = sort smaller @values; + } + } + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub ipc +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_ipc.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c c\nl r r r.\n"; + print FD "System\tLibc bcopy\t\\fBpipe\\fP\tTCP\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_pipe[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_tcp_local[$i]; + $value = sprintf("%.0f", $value); + # next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort bigger @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub read +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_reread2.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c c|c c\nl|c c|c c\nl|r r|r r.\n"; + print FD "\tLibc\t\\fBFile\\fP\tMemory\tFile\nSystem\tbcopy\t\\fBread\\fP\tread\tmmap\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_reread[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mem_rdsum[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mmap[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort bigger @values unless ($nosort); + $sortN = 
0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub mem +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/bw_allmem.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c s|c s\nl|c c|c c\nl|r r|r r.\n"; + print FD "\tBcopy\tMemory\nSystem\t\\fBunrolled\\fP\tlibc\tread\twrite\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $bw_bcopy_unrolled[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $bw_bcopy_libc[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + $value = $bw_mem_rdsum[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\t"; + + $value = $bw_mem_wr[$i]; + $value = sprintf("%.0f", $value); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 1; + @values = sort bigger @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. 
+ foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); + + @values = (); + open(FD, ">tmp/lat_allmem.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl c c c\nl c c c\nl r r r.\n"; + print FD "\tLevel 1\tLevel 2\tMain\n"; + print FD "System\tcache\tcache\tmemory\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $value = $lat_l1[$i]; + next if ($value <= 0); + if (&same($lat_l1[$i], $lat_l2[$i])) { + $value = "--"; + } + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + $value = $lat_l2[$i]; + next if ($value <= 0); + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + $value = "--"; + } + $line .= "$value\t"; + $value = $lat_mem[$i]; + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + + $sortN = 3; + @values = sort smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub procs +{ + local($graph, $a, $b, $c, $units) = @_; + local(@values, @tmp, $line, $persec, $value); + + warn "tmp/$graph.tbl\n" if $v; + open(FD, ">tmp/$graph.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nl|c|c|c\nl|r|r|r.\n"; + print FD "\tfork\t\\fBfork, exec\\fP\tfork, exec\n"; + print FD "System\t& exit\t\\fB& exit\\fP\tsh -c & exit\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + eval '$value = $'.$a.'[$i];'; + $value = sprintf("%.1f", $value / 1000); + next if ($value <= 0); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t$value\t"; + eval '$value = $'.$b.'[$i];'; + $value = sprintf("%.0f", $value / 1000); + next if ($value <= 0); + $line .= "$value\\ \t"; + eval '$value = $'.$c.'[$i];'; + $value = sprintf("%.0f", $value / 1000); + next if ($value <= 0); + $line .= "$value\\ \n"; + push(@values, $line); + } + $sortN = 2; + @values = sort 
smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub ctx +{ + local(@values, @tmp, $line, $persec, $value); + + open(FD, ">tmp/ctx.tbl"); + print FD ".KS\n.TS\nexpand doublebox;\nc|c s|c s\nl|c c|c c\nl|r r|r r.\n"; + print FD "\t2 processes\t8 processes\nSystem\t\\fB0KB\\fP\t32KB\t0KB\t32KB\n=\n"; + for ($i = 0; $i <= $#uname; $i++) { + @info = &getinfo($uname[$i], $misc_mhz[$i]); + $_ = "$info[3] $info[$#info]"; + &papernames; + $line = "$_\t"; + foreach $a ('lat_ctx', 'lat_ctx32', 'lat_ctx8', 'lat_ctx32_8') { + eval '$value = $'.$a.'[$i];'; + $line .= "$value\t"; + } + chop($line); + push(@values, "$line\\ \n"); + } + $sortN = 1; + @values = sort smaller @values unless ($nosort); + $sortN = 0; + # Somewhere an extra space is getting added. + foreach $_ (@values) { + s/^\s*//; + print FD; + } + print FD ".TE\n.KE\n"; + close(FD); +} + +sub papernames +{ + $_ = "IBM PowerPC" if /AIX powerpc\@134/; + $_ = "IBM Power2" if /AIX rs6000-990\@71/; + $_ = "FreeBSD/i586" if /FreeBSD i586\@13[01234]/; + $_ = "HP 9000/819" if /HP-UX 9000.819\@/; + $_ = "HP K210" if /HP-UX 9000.859\@/; + $_ = "SGI Challenge/R10K" if /IRIX.* IP25\@/; + $_ = "SGI Challenge/R4K" if /IRIX.* IP19\@/; + $_ = "SGI Indigo2" if /IRIX.* IP22\@/; + $_ = "Linux/Alpha" if /Linux alpha\@/; + $_ = "Linux/i686" if /Linux i686\@/; + $_ = "Linux/i586" if /Linux i586\@/; + $_ = "DEC Alpha\@150" if /OSF1 alpha\@147/; + $_ = "DEC Alpha\@300" if /OSF1 alpha\@303/; + $_ = "Sun SC1000" if /SunOS-5.5 sun4d\@5/; + $_ = "Sun Ultra1" if /SunOS-5.5 sun4u/; + $_ = "Solaris/i686" if /SunOS-5.5.1 i86pc\@13/; + $_ = "Unixware/i686" if /UNIX_SV x86at/; +} + +sub bg +{ + local($graph, $units, $title) = @_; + local($persec, $value); + + if ($nosort) { + open(FD, ">tmp/$graph.bg"); + } else { + open(FD, "|sort -nr > tmp/$graph.bg"); + } + for ($i = 0; $i <= $#uname; $i++) { + @info = 
&getinfo($uname[$i], $misc_mhz[$i]); +# eval "\$value = \$$graph[$i];"; + + $XXX = '$value = $'.$graph.'[$i];'; + eval '$value = $'.$graph.'[$i];'; + if ($uname[$i] =~ /IRIX/) { + $fill = " %%fill0"; + } elsif ($uname[$i] =~ /HP/) { + $fill = " %%fill.3"; + } elsif ($uname[$i] =~ /AIX/) { + $fill = " %%fill.1"; + } elsif ($uname[$i] =~ /OSF/) { + $fill = " %%fill.5"; + } elsif ($uname[$i] =~ /Linux/) { + $fill = " %%fill.7"; + } elsif ($uname[$i] =~ /Sun/) { + $fill = " %%fill1"; + } else { + $fill = ""; + } + if ($units eq "usecs") { + if (!defined $value || $value <= 0) { + warn + "$ARGV[$i] $graph $info[$#info]: value is 0\n"; + $persec = 0; + $value = 0; + } else { + $persec = 1000000 / $value; + } + if (0) { + printf FD + "%.0f\t$info[3] $info[$#info] $value\\ $units$fill\n", + $persec; + } else { + printf FD + "%.0f\t%s %s $value\\ $units$fill\n", + $persec, $file[$i], &getos($uname[$i]); + } + } elsif ($units eq "MB") { + printf FD "$value\t$info[3] $info[$#info]$fill\n"; + } elsif ($units eq "mhz") { + printf FD "$value\t$info[3] $info[$#info]$fill\n"; + } else { + die "Unknown units: $units"; + } + } + if ($slide) { + print FD "%Title n $title\n"; + print FD "%ps 12\n"; + print FD "%ft HB\n"; + } else { + print FD "%Title n $title\n"; + print FD "%Title s lmbench v1.1\n"; + print FD "%ps 16\n"; + print FD "%ft R\n"; + } + close(FD); +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = $_[1]; + + $mhz =~ s/[\. 
].*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /AIX/) { + $name = "$name@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + $info[5] =~ s/-.*//; + "$info[3] $info[5]"; +} + diff --git a/performance/lmbench3/scripts/getbw b/performance/lmbench3/scripts/getbw new file mode 100755 index 0000000..27b182b --- /dev/null +++ b/performance/lmbench3/scripts/getbw @@ -0,0 +1,260 @@ + +# Extract the bandwith information. +# Usage: getbw file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getbw 1.6 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# +# Default is file bandwidth which lists: mem read, file read (both), +# mmap read (both), bcopy. +# +# -mem turns off the file stuff but turns on rd, wr, rdwr, frd, fwr, +# bcopy, bzero, cp, fcp. 
+# +foreach $file (@ARGV) { + open(FD, $file); + &cache; + open(FD, $file); + ($f = $file) =~ s|/|-|; + if ($mem || $all) { + print "tmp/bwmem.$f\n"; + open(OUT, ">tmp/bwmem.$f"); + } else { + print "tmp/bwfile.$f\n"; + open(OUT, ">tmp/bwfile.$f"); + } + print OUT "%X Memory size \n%Y Bandwidth in MB/sec\n"; + while (<FD>) { + chop; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/^\d+.*Mhz/) { + @_ = split; + $mhz = $_[0]; + $tmp = &getinfo("$uname", $mhz); + if ($mem) { + print OUT "%T Memory bandwidth for $tmp\n"; + } else { + print OUT "%T Reread bandwidth for $tmp\n"; + } + } + if (/MHZ/) { + @_ = split; + $mhz = $_[1]; + chop($mhz) if $mhz =~ /]$/; + $tmp = &getinfo("$uname", $mhz); + if ($mem) { + print OUT "%T Memory bandwidth for $tmp\n"; + } else { + print OUT "%T Reread bandwidth for $tmp\n"; + } + } + if ((!$all && !$mem) && /^"read bandwidth/) { + print OUT "\"File reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"read open2close bandwidth/) { + print OUT "\"File open2close reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"Mmap read bandwidth/) { + print OUT "\"File mmap reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ((!$all && !$mem) && /^"Mmap read open2close bandwidth/) { + print OUT "\"File mmap open2close reread\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ($all && /^"libc bcopy aligned/) { + print OUT "\"libc bcopy aligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (/^"libc bcopy unaligned/) { + print OUT "\"libc bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if ($all && /^"unrolled bcopy aligned/) { + print OUT "\"libc bcopy 
unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^"unrolled bcopy unaligned/) { + print OUT "\"unrolled bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^"unrolled partial bcopy unaligned/) { + print OUT "\"unrolled partial bcopy unaligned\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (/^Memory read bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial read bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial read.write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory partial write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + @_ = split; next unless $_[0] > $cache; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory write bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + if (($all || $mem) && /^Memory bzero bandwidth/) { + print OUT "\"$_\n"; + while (<FD>) { + last if /^\s*$/; + print OUT; + } + print OUT "\n"; + next; + } + } +} + +# Paw through the data and figure out how big the L1 cache is. +# We look at the memory read performance and look for cluster breaks +# at 4, 8, 16, 32, 64, 126, and 256k. 
+sub cache +{ + local($in) = 0; + local($n, $sum, $avg) = (0,0,0); + + $cache = 0; + while (<FD>) { + if (/^Memory partial read bandwidth/) { + $in = 1; + next; + } + next unless $in; + @_ = split; + if ($n == 0) { + $sum += $_[1]; + $n++; + next; + } + $avg = $sum/$n; + if ($_[1] < .75*$avg) { + $cache = $last; + return; + } + $last = $_[0]; + $sum += $_[1]; + $n++; + } +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz); + + $mhz = $_[1]; + $_ = $_[0]; + @info = split; + $name = pop(@info); + chop($name); + if ($name eq "unknown") { + $name = pop(@info); + } + if ($name eq "mips") { + $name = "$info[$#info]\@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]\@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]\@$mhz"; + } else { + $name .= "\@$mhz"; + } + "$info[3] $name"; +} diff --git a/performance/lmbench3/scripts/getctx b/performance/lmbench3/scripts/getctx new file mode 100755 index 0000000..da7d645 --- /dev/null +++ b/performance/lmbench3/scripts/getctx @@ -0,0 +1,79 @@ + +# Extract the context switching information from lmbench result files. +# Usage: getctx file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getctx 1.8 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +$title = "foo" if 0; + +foreach $file (@ARGV) { + open(FD, $file); + while (<FD>) { + chop; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^.size=/) { + s/size/Process size/; + s/ ovr/\toverhead/; + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|/|-|; + print "tmp/ctx.$f\n"; + open(OUT, ">tmp/ctx.$f"); + print OUT "\"%X Processes \n\"%Y Time in microseconds\n"; + if ($title) { + print OUT "%T $f\n"; + } else { + print OUT + "\"%T Context switches for " . 
+ "$info[3] $info[$#info]Mhz\n"; + } + print OUT "$_\n"; + while (<FD>) { + last if /^Null/ || /^Pipe/ || /^Memor/; + next if /\$Id/; + next if m|scripts/lmbench: /dev/tty|; + s/ ovr/\toverhead/; + s/size/Process size/; + print OUT; + } + close(OUT); + last; + } + } +} + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz); + + ($mhz = $_[1]) =~ s/[\. ].*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git a/performance/lmbench3/scripts/getdisk b/performance/lmbench3/scripts/getdisk new file mode 100755 index 0000000..3d0199b --- /dev/null +++ b/performance/lmbench3/scripts/getdisk @@ -0,0 +1,69 @@ + +# Extract the disk graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: getdisk 1.2 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|/|-|; + while (<FD>) { + next unless /DISK_DESC/; + s/.DISK_DESC: //; + chop; chop; chop; + @_ = split(/[\[\]]/, $_); + foreach $_ (@_) { + next unless /:/; + @foo = split(/:/, $_); + $foo[0] =~ s|/dev/||; + $disks{$foo[0]} = $foo[1]; + } + last; + } + while (<FD>) { + if (/^"Seek times for \/dev\/(.*)$/) { + $ok = 0; + foreach $key (keys %disks) { + next unless $key eq $1; + $ok = 1; + } + if ($ok != 1) { + die "Disk results are screwed up, no $1.\n"; + } + print "tmp/seek_$1.$file\n"; + open(OUT, ">tmp/seek_$1.$file"); + print OUT "%T Seek times for $disks{$1}\n"; + print OUT "%X Seek distance (MB)\n"; + print OUT "%Y Time in millisec\n"; + while (<FD>) { + last unless /^\d/; + print OUT; + } + close(OUT); + } + if (/^"Zone bandwidth for \/dev\/(.*)$/) { + $ok = 0; + foreach $key (keys %disks) { + next unless $key eq $1; + $ok = 1; + } + if ($ok != 1) { + die "Disk results are screwed up, no $1.\n"; + } + print "tmp/zone_$1.$file\n"; + open(OUT, ">tmp/zone_$1.$file"); + print OUT "%T Zone bandwidths for $disks{$1}\n"; + print OUT "%X Disk offset (MB)\n"; + print OUT "%Y Bandwidth (MB/sec)\n"; + while (<FD>) { + last unless /^\d/; + print OUT; + } + close(OUT); + } + } +} +exit 0; diff --git a/performance/lmbench3/scripts/getlist b/performance/lmbench3/scripts/getlist new file mode 100755 index 0000000..8c35970 --- /dev/null +++ b/performance/lmbench3/scripts/getlist @@ -0,0 +1,31 @@ + +# Find everything in my results directory that looks like lmbench output. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxxxxxxx) +# Copyright (c) 1994-1998 Larry McVoy. 
+# $Id$ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$LIST = "no such file"; +$LIST = "LIST" if (-f "LIST"); +$LIST = $ARGV[0] if (($#ARGV == 0) && (-f $ARGV[0])); +if (-f $LIST) { + open(L, $LIST); + $_ = <L>; + chop; + @files = split; + close(L); +} else { + @files = <*/*>; +} +foreach $file (@files) { + next if $file =~ /\.INFO$/; + open(FD, $file) || next; + next unless defined($_ = <FD>); + close(FD); + next unless /^\[lmbench3.[01]/; + print "$file "; +} +print "\n"; +exit 0; diff --git a/performance/lmbench3/scripts/getmax b/performance/lmbench3/scripts/getmax new file mode 100755 index 0000000..754b50c --- /dev/null +++ b/performance/lmbench3/scripts/getmax @@ -0,0 +1,73 @@ + +# Look at a bunch of bargraph files and figure out the max amongst them all. +# Usage: getmax file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getmax 1.10 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$graph = 1 if 0; +$exit = 1; +foreach $file (@ARGV) { + $exit = 0 if -f $file; +} +exit $exit if $noop; + +$noop = 1 if 0; +$max_X = $max_Y = -1000000000; +$min_X = $min_Y = 1000000000; +foreach $file (@ARGV) { + next if $rmmax; + unless (open(FD, $file)) { + warn "Can't open $file\n"; + next; + } + while (<FD>) { + next if /^"/; + next if /^%/; + next if /^\s*$/; + next if m|scripts/lmbench: /dev/tty|; + @_ = split; + $min_X = $_[0] if ($_[0] < $min_X); + $min_Y = $_[1] if ($_[1] < $min_Y); + $max_X = $_[0] if ($_[0] > $max_X); + $max_Y = $_[1] if ($_[1] > $max_Y); + } + close(FD); +} +$half = 0 if 0; # lint +$max_X /= 2 if ($half); +foreach $file (@ARGV) { + unless (open(FD, $file)) { + warn "Can't open $file\n"; + next; + } + @lines = <FD>; + open(FD, ">$file") || die "Can't open $file\n"; + if ($graph) { + print FD "%fakemin-X $min_X\n"; + print FD "%fakemin-Y $min_Y\n"; + print FD "%fakemax-X $max_X\n"; + print FD "%fakemax-Y $max_Y\n"; + 
foreach $_ (@lines) { + next if /^%fakem/; + print FD; + } + warn "Max X is $max_X\n" if $v; + warn "Max Y is $max_Y\n" if $v; + } elsif ($rmmax) { + foreach $_ (@lines) { + next if /^%fakem/; + print FD; + } + } else { + print FD @lines; + print FD "%fakemax $max_X\n"; + warn "Max X is $max_X\n" if $v; + } + close(FD); +} +exit $exit; diff --git a/performance/lmbench3/scripts/getmem b/performance/lmbench3/scripts/getmem new file mode 100755 index 0000000..d3ea7ac --- /dev/null +++ b/performance/lmbench3/scripts/getmem @@ -0,0 +1,69 @@ + +# Extract the memory latency graph data from lmbench result files. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getmem 1.7 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ss $0 "$@"' + if 0; + +foreach $file (@ARGV) { + open(FD, $file); + $file =~ s|/|-|; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + @_ = split; + if ($_[3] eq "SunOS") { + $_[3] .= "-$_[5]"; + } + $uname = "@_"; + } + if (/Mhz/) { + $mhz = $_; + } + if (/^Memory load latency/) { + @info = &getinfo($uname, $mhz); + ($f = $file) =~ s|.*/||; + print "tmp/mem.$f\n"; + open(OUT, ">tmp/mem.$f"); + print OUT "\"%X Array size\n\"%Y Latency in nanoseconds\n"; + print OUT + "\"%T $file $info[3] $info[$#info] memory latencies\n"; + while (<FD>) { + next if /\$Id/; + next if /^\[/; + print OUT; + } + close(OUT); + last; + } + } +} +exit 0; + +# Try and create sensible names from uname -a output +sub getinfo +{ + local(@info); + local($name); + local($mhz) = $_[1]; + + $mhz =~ s/\..*//; + $mhz =~ s/ .*//; + @info = split(/\s+/, $_[0]); + $name = pop(@info); + chop($name); + if ($name eq "mips") { + $name = "$info[$#info]@$mhz"; + } elsif ($_[0] =~ /HP-UX/) { + $name = "$info[7]@$mhz"; + } elsif ($_[0] =~ /SunOS/) { + $name = "$info[7]@$mhz"; + } else { + $name .= "@$mhz"; + } + push(@info, $name); + @info; +} diff --git 
a/performance/lmbench3/scripts/getpercent b/performance/lmbench3/scripts/getpercent new file mode 100755 index 0000000..6ede4c2 --- /dev/null +++ b/performance/lmbench3/scripts/getpercent @@ -0,0 +1,400 @@ + +# Generate an ascii percentage summary from lmbench result files. +# Usage: getpercent file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getpercent 1.9 00/01/31 15:29:41-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$n = 0; # apparently, hpux doesn't init to 0???? + +foreach $file (@ARGV) { + push(@files, $file); + open(FD, $file) || die "$0: can't open $file"; + $file =~ s|/|-|; + $file =~ s/\.\d+//; + push(@file, $file); + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + split; + push(@uname, "@_"); + } + if (/Mhz/) { + split; + push(@misc_mhz, $_[0]); + } + if (/^Null syscall:/) { + split; + push(@lat_nullsys, $_[2]); + } + if (/^Pipe latency:/) { + split; + push(@lat_pipe, $_[2]); + } + if (/UDP latency using localhost:/) { + split; + push(@lat_udp_local, $_[4]); + } + if (/TCP latency using localhost/) { + split; + push(@lat_tcp_local, $_[4]); + } + if (/RPC.udp latency using localhost/) { + split; + push(@lat_rpc_udp_local, $_[4]); + } + if (/RPC.tcp latency using localhost/) { + split; + push(@lat_rpc_tcp_local, $_[4]); + } + if (/^Process fork.exit/) { + split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + split; + push(@lat_shproc, $_[3]); + } + if (/size=0 ovr=/) { + while (<FD>) { + next unless /^2/; + split; + push(@lat_ctx, $_[1]); + last; + } + while (<FD>) { + next unless /^8/; + split; + push(@lat_ctx8, $_[1]); + last; + } + } + if (/^Pipe bandwidth/) { + split; + push(@bw_pipe, $_[2]); + } + if (/^Socket bandwidth using localhost/) { + split; + push(@bw_tcp_local, $_[4]); + } + if 
(/^File .* write bandwidth/) { + split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + split; + push(@lat_l1, $_[1]); + } elsif (/^0.12500\s/) { + split; + push(@lat_l2, $_[1]); + } elsif (/^[45678].00000\s/) { + split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + if (/^"stride=8192/) { # XXX assumes <= 8K pagesize + $tbl = -1; + while (<FD>) { + if (/^[45678].00000\s/) { + split; + $tlb = $_[1]; + $size = $_[0]; + last if /^8.00000\s/; + } + } + if (!/^8/) { + warn "$file: No 8MB tlb latency, using $size\n"; + } + push(@lat_tlb, $tlb); + } + } + foreach $array ( + 'misc_mhz', 'lat_nullsys', 'lat_pipe', 'lat_udp_local', + 'lat_tcp_local', 'lat_rpc_udp_local', + 'lat_rpc_tcp_local', 'lat_nullproc', 'lat_simpleproc', + 'lat_ctx', 'lat_ctx8', 'bw_pipe', 'bw_tcp_local', + 'bw_file', 'lat_mappings', 'bw_reread', 'bw_mmap', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_mem_rdsum', + 'bw_mem_wr', 'lat_l1', 'lat_l2', 'lat_mem', 'lat_tlb', + ) { + eval "if (\$#$array 
!= $n) { + warn \"No data for $array in $file\n\"; + push(\@$array, -1); + }"; + } + $n++; +} +exit 0; + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest vvalue before the blank line. +sub getbiggest +{ + local($msg) = @_; + + undef $save; + $value = 0; + while (<FD>) { + last if /^\s*$/; + $save = $_ if /^\d\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 8) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +print<<EOF; + + L M B E N C H 1 . 0 S U M M A R Y + ------------------------------------ + + Comparison to best of the breed + ------------------------------- + + (Best numbers are starred, i.e., *123) + + + Processor, Processes - factor slower than the best + -------------------------------------------------- +Host OS Mhz Null Null Simple /bin/sh Mmap 2-proc 8-proc + Syscall Process Process Process lat ctxsw ctxsw +--------- ------------- ---- ------- ------- ------- ------- ---- ------ ------ +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%4.0f %7s %7s %7s %7s %4s %6s %6s\n", + $misc_mhz[$i], + &smaller(@lat_nullsys, $i, 0), + &smaller(@lat_nullproc, $i, 1024), + &smaller(@lat_simpleproc, $i, 1024), + &smaller(@lat_shproc, $i, 1024), + &smaller(@lat_mappings, $i, 0), + &smaller(@lat_ctx, $i, 0), + &smaller(@lat_ctx8, $i, 0); + +} + +print<<EOF; + + *Local* Communication latencies - factor slower than the best + ------------------------------------------------------------- +Host OS Pipe UDP RPC/ TCP RPC/ + UDP TCP +--------- ------------- ------- ------- ------- ------- ------- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%7s %7s %7s %7s %7s\n", + &smaller(@lat_pipe, $i, 0), + &smaller(@lat_udp_local, $i, 0), + &smaller(@lat_rpc_udp_local, $i, 0), + 
&smaller(@lat_tcp_local, $i, 0), + &smaller(@lat_rpc_tcp_local, $i, 0); + +} + +print<<EOF; + + *Local* Communication bandwidths - percentage of the best + --------------------------------------------------------- +Host OS Pipe TCP File Mmap Bcopy Bcopy Mem Mem + reread reread (libc) (hand) read write +--------- ------------- ---- ---- ------ ------ ------ ------ ---- ----- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s ", $file[$i], &getos($uname[$i]); + printf "%4s %4s %6s %6s %6s %6s %4s %5s\n", + &bigger(@bw_pipe, $i), + &bigger(@bw_tcp_local, $i), + &bigger(@bw_reread, $i), + &bigger(@bw_mmap, $i), + &bigger(@bw_bcopy_libc, $i), + &bigger(@bw_bcopy_unrolled, $i), + &bigger(@bw_mem_rdsum, $i), + &bigger(@bw_mem_wr, $i); +} + +print<<EOF; + + Memory latencies in nanoseconds - factor slower than the best + (WARNING - may not be correct, check graphs) + ------------------------------------------------------------- +Host OS Mhz L1 \$ L2 \$ Main mem Guesses +--------- ------------- --- ---- ---- -------- ------- +EOF + +for ($i = 0; $i <= $#uname; $i++) { + printf "%-9.9s %13.13s %3d", + $file[$i], &getos($uname[$i]), $misc_mhz[$i]; + if ($lat_l1[$i] < 0) { + printf "%6s %6s %11s %s", + "-", "-", "-", + "Bad mhz?"; + } else { + $msg = &check_caches; + if ($msg =~ /L1/) { + $lat_l1[$i] = -1; + } elsif ($msg =~ /L2/) { + $lat_l2[$i] = -1; + } + printf "%6s %6s %11s", + &smaller(@lat_l1, $i, 0), + &smaller(@lat_l2, $i, 0), + &smaller(@lat_mem, $i, 0); + if ($msg =~ /L/) { + print "$msg"; + } + } + print "\n"; +} + + +exit 0; + +# Return factor of the smallest number. 
+sub smaller +{ + local(@values) = @_; + local($which, $min, $i, $units); + + $units = pop(@values); + $which = pop(@values); + $min = 0x7fffffff; + foreach $i (@values) { + next if $i == -1 || $i == 0; + $min = $i if ($min > $i); + } + if ($values[$which] == $min) { + #"***"; + if ($units == 1024) { + sprintf("*%.1fK", $values[$which]/1024.); + } else { + sprintf("*%d", $values[$which]); + } + } elsif ($values[$which] == -1) { + "???"; + } elsif ($values[$which] == 0) { + "???"; + } elsif ($values[$which] / $min < 10.0) { + sprintf("%.1f", $values[$which] / $min); + } else { + sprintf("%.0f", $values[$which] / $min); + } +} + +# Return closeness to the largest number as a percentage. +# Exact match is 100%, smaller numbers are like 15%. +sub bigger +{ + local(@values) = @_; + local($which, $max, $i); + + $which = pop(@values); + $max = 0; + foreach $i (@values) { + $max = $i if ($max < $i); + } + if ($values[$which] == $max) { + sprintf("*%d", $values[$which]); + } else { + sprintf("%d%%", $values[$which] / $max * 100); + } +} + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} diff --git a/performance/lmbench3/scripts/getresults b/performance/lmbench3/scripts/getresults new file mode 100755 index 0000000..c5665b5 --- /dev/null +++ b/performance/lmbench3/scripts/getresults @@ -0,0 +1,99 @@ +#!/usr/bin/perl -ws + +# Search through the archives splitting out stuff that has pathnames. 
+ +while (1) { + &headers; + &body; +} + +sub headers +{ + while (<>) { + warn "HDR $_" if ($debug); + return if /^\s*$/; + } + exit; +} + +# Save the info for the system, skipping everything ig there is no info. +sub body +{ + @info = (); + while (<>) { + last if m|^[-]+ \.\./results|; + last if /^\[lmbench/; + if (/^From[: ]/) { warn "FROM $_"; return; } + warn "INFO $_" if ($debug); + push(@info, $_); + } + if (/^[-]+ \.\.\/results/) { + @foo = split; + $path = $foo[1]; + $path =~ s|\.\./||; + warn "PATH $path\n" if ($debug); + &results; + return; + } + warn "SKIPPING one\n"; + while (<>) { + warn "SKIP $_" if ($SKIP); + last if /^Memory load latency/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } + die "No memory load latency" unless /^Memory load latency/; + while (<>) { + warn "SKIP $_" if ($SKIP); + last if /^\[/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } + die "No date" unless /^\[/; + while (<>) { + last unless /^\s*$/; + if (/^From[: ]/) { warn "FROM $_"; return; } + } +} + +sub results +{ + @results = (); + while (<>) { + goto done if (/^From[: ]/); + warn "RES $_" if ($RES); + push(@results, $_); + last if /^Memory load latency/; + } + die "No memory load latency" unless /^Memory load latency/; + while (<>) { + goto done if (/^From[: ]/); + warn "RES $_" if ($RES); + push(@results, $_); + last if /^\[/; + } + die "No date" unless /^\[/; + while (<>) { + last unless /^\s*$/; + } + +done: + ($dir = $path) =~ s|/[^/]+$||; + warn "DIR $dir\n" if ($debug); + system "mkdir -p $dir"; + if (-e $path) { + warn "CONFLICT on $path\n" if $debug; + for ($i = 0; ; $i++) { + $tmp = "${path}.${i}"; + last if ! -e $tmp; + warn "CONFLICT on $tmp\n" if $debug; + } + $path = $tmp; + } + $info = $path . 
".INFO"; + open(O, ">$info"); + print O @info; + close(O); + warn "Saving $path\n" if $verbose; + open(O, ">$path"); + print O @results; + close(O); +} diff --git a/performance/lmbench3/scripts/getsummary b/performance/lmbench3/scripts/getsummary new file mode 100755 index 0000000..43bdae5 --- /dev/null +++ b/performance/lmbench3/scripts/getsummary @@ -0,0 +1,1089 @@ + +# Generate an ascii summary from lmbench result files. +# Usage: getsummary file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: getsummary 1.34 05/02/17 16:40:22+02:00 staelin@xxxxxxxxxxxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Use these constants to same typo-induced bugs later! +$M = 1000000.; +$K = 1000.; + +$n = 0; +foreach $file (@ARGV) { + open(FD, $file) || die "$0: can't open $file"; + $file =~ s/\.\d+$//; + @_ = split(/\//, $file); + push(@host, $_[$#_]); + $file = $_[$#_ - 1]; + $file =~ s|/|-|; + push(@file, $file); + $lat_mem_rd_type = -1; + $mhz = 0; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + $version = -1; + push(@uname, $_); + if (/lmbench1\./) { + $version = 1; + } + if (/lmbench2\./) { + $version = 2; + } + if (/lmbench3\./) { + $version = 3; + } + } + if (/MHZ/ && !$mhz) { + @_ = split; + $_[1] =~ s/\]//; + push(@misc_mhz, $_[1]); + $mhz = 1; + } elsif (/Mhz/ && !$mhz) { + @_ = split; + push(@misc_mhz, $_[0]); + $mhz = 1; + } + if (/^Select on 100 fd/) { + @_ = split; + push(@lat_fd_select, $_[4]); + } + if (/^Select on 100 tcp fd/) { + @_ = split; + push(@lat_tcp_select, $_[5]); + } + if (/^integer bit:/) { + @_ = split; + push(@integer_bit, $_[2]); + } + if (/^integer add:/) { + @_ = split; + push(@integer_add, $_[2]); + } + if (/^integer mul:/) { + @_ = split; + push(@integer_mul, $_[2]); + } + if (/^integer div:/) { + @_ = split; + push(@integer_div, $_[2]); + } + if (/^integer mod:/) { + @_ = split; + 
push(@integer_mod, $_[2]); + } + if (/^uint64 bit:/) { + @_ = split; + push(@int64_bit, $_[2]); + } + if (/^uint64 add:/) { + @_ = split; + push(@int64_add, $_[2]); + } + if (/^uint64 mul:/) { + @_ = split; + push(@int64_mul, $_[2]); + } + if (/^uint64 div:/) { + @_ = split; + push(@int64_div, $_[2]); + } + if (/^uint64 mod:/) { + @_ = split; + push(@int64_mod, $_[2]); + } + if (/^float add:/) { + @_ = split; + push(@float_add, $_[2]); + } + if (/^float mul:/) { + @_ = split; + push(@float_mul, $_[2]); + } + if (/^float div:/) { + @_ = split; + push(@float_div, $_[2]); + } + if (/^double add:/) { + @_ = split; + push(@double_add, $_[2]); + } + if (/^double mul:/) { + @_ = split; + push(@double_mul, $_[2]); + } + if (/^double div:/) { + @_ = split; + push(@double_div, $_[2]); + } + if (/^float bogomflops:/) { + @_ = split; + push(@float_bogomflops, $_[2]); + } + if (/^double bogomflops:/) { + @_ = split; + push(@double_bogomflops, $_[2]); + } + if (/LINE_SIZE/) { + @_ = split; + $_[1] =~ s/\]//; + push(@line_size, $_[1]); + } + if (/SYNC_MAX/) { + @_ = split; + $_[1] =~ s/\]//; + push(@load, $_[1]); + } + if (/^tlb:/) { + @_ = split; + push(@tlb, $_[1]); + } + if (/^Simple syscall:/) { + @_ = split; + push(@lat_syscall, $_[2]); + } + if (/^Simple read:/) { + @_ = split; + push(@lat_read, $_[2]); + } + if (/^Simple write:/) { + @_ = split; + push(@lat_write, $_[2]); + } + if (/^Simple stat:/) { + @_ = split; + push(@lat_stat, $_[2]); + } + if (/^Simple open.close:/) { + @_ = split; + push(@lat_openclose, $_[2]); + } + if (/^Null syscall:/) { # Old format. 
+ @_ = split; + push(@lat_write, $_[2]); + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_siginstall, $_[3]); + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_sigcatch, $_[3]); + } + if (/^Protection fault:/) { + @_ = split; + push(@lat_protfault, $_[2]); + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + } + if (/AF_UNIX sock stream latency:/) { + @_ = split; + push(@lat_unix, $_[4]); + } + if (/UDP latency using localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + } elsif (/UDP latency using/) { + @_ = split; + push(@lat_udp_remote, $_[4]); + } + if (/TCP latency using localhost:/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + } elsif (/TCP latency using/) { + @_ = split; + push(@lat_tcp_remote, $_[4]); + } + if (/RPC.udp latency using localhost:/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + } elsif (/RPC.udp latency using/) { + @_ = split; + push(@lat_rpc_udp_remote, $_[4]); + } + if (/RPC.tcp latency using localhost:/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + } elsif (/RPC.tcp latency using/) { + @_ = split; + push(@lat_rpc_tcp_remote, $_[4]); + } + if (/TCP.IP connection cost to localhost:/) { + @_ = split; + push(@lat_tcp_connect_local, $_[5]); + } elsif (/TCP.IP connection cost to/) { + @_ = split; + push(@lat_tcp_connect_remote, $_[5]); + } + if (/^Socket bandwidth using localhost/) { + $value = &getbiggest("Socket bandwidth using localhost"); + push(@bw_tcp_local, $value); +# } elsif (/^Socket bandwidth using /) { +# $value = &getbiggest("Socket bandwidth using remote"); +# push(@bw_tcp_remote, $value); + } + if (/^AF_UNIX sock stream bandwidth:/) { + @_ = split; + push(@bw_unix, $_[4]); + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + } + if (/^Pipe bandwidth/) { + @_ = 
split; + push(@bw_pipe, $_[2]); + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + } + if (/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + } + if (/^"Mmap read bandwidth/) { + $value = &getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + } + if (/^Memory load parallelism/) { + $value = &getbiggest("Memory load parallelism"); + push(@mem_load_par, $value); + } + + if (/^"File system latency/) { + while (<FD>) { + next if /Id:/; + if (/^0k/) { + @_ = split; + push(@fs_create_0k, $_[2]); + push(@fs_delete_0k, $_[3]); + } elsif (/^1k/) { + @_ = split; + push(@fs_create_1k, $_[2]); + push(@fs_delete_1k, $_[3]); + } elsif (/^4k/) { + @_ = split; + push(@fs_create_4k, $_[2]); + push(@fs_delete_4k, $_[3]); + } elsif (/^10k/) { + @_ = split; + push(@fs_create_10k, $_[2]); + push(@fs_delete_10k, $_[3]); + } else { + last; + } + } + } + if (/size=0/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx0_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx0_8, $_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx0_16, $_[1]); + } + last if /^\s*$/ || /^Memory/; + } + } + if (/size=16/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx16_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx16_8, 
$_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx16_16, $_[1]); + } + last if /^\s*$/; + } + } + if (/size=64/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx64_2, $_[1]); + } elsif (/^8 /) { + @_ = split; push(@lat_ctx64_8, $_[1]); + } elsif (/^16 /) { + @_ = split; push(@lat_ctx64_16, $_[1]); + } + last if /^\s*$/ || /^20/; + } + } + if (/^Memory load latency/) { + $lat_mem_rd_type = 1; + } + if (/^Random load latency/) { + $lat_mem_rd_type = 2; + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^\s*$/) { + last; + } + @_ = split; + $size = $_[0]; + $save = $_[1]; + if ($size == 0.00098 && $lat_mem_rd_type == 1) { + push(@lat_l1, $_[1]); + } elsif ($size == 0.12500 && $lat_mem_rd_type == 1) { + push(@lat_l2, $_[1]); + } + } + if ($size < 8.0) { + warn "$file: No 8MB memory latency, using $size\n"; + } + if ($lat_mem_rd_type == 1) { + push(@lat_mem, $save); + } + } + if (/^"stride=16/) { + $save = -1; + while (<FD>) { + if (/^\s*$/) { + last; + } + @_ = split; + $size = $_[0]; + $save = $_[1]; + } + if ($size < 8.0) { + warn "$file: No 8MB random access memory latency, using $size\n"; + } + if ($lat_mem_rd_type == 2) { + warn "$file: lat_mem_rand = $save\n"; + push(@lat_mem_rand, $save); + } + } + } + @warn = (); + foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_tcp_remote', 'bw_unix', + 'double_add', 'double_bogomflops', 'double_div', 'double_mul', + 'float_add', 'float_bogomflops', 'float_div', 'float_mul', + 'fs_create_0k', 'fs_create_1k', 'fs_create_4k', + 'fs_create_10k', 'fs_delete_0k', 'fs_delete_1k', + 'fs_delete_4k', 'fs_delete_10k', 'integer_add', + 'integer_bit', 'integer_div', 'integer_mod', 'integer_mul', + 'lat_ctx0_2', 'lat_ctx0_8', 'lat_ctx0_16', + 'lat_ctx16_2', 'lat_ctx16_8', 'lat_ctx16_16', + 'lat_ctx64_2', 'lat_ctx64_8', 'lat_ctx64_16', + 'lat_l1', 'lat_l2', 'lat_mappings', 'lat_mem', + 'lat_mem_rand', 
'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', 'lat_rpc_tcp_local', + 'lat_rpc_tcp_remote', 'lat_rpc_udp_local', + 'lat_rpc_udp_remote', 'lat_fd_select', 'lat_tcp_select', + 'lat_shproc', 'lat_sigcatch', 'lat_siginstall', + 'lat_simpleproc', 'lat_stat', 'lat_syscall', + 'lat_tcp_connect_local', 'lat_tcp_connect_remote', + 'lat_tcp_local', 'lat_tcp_remote', + 'lat_udp_local', 'lat_udp_remote', 'lat_unix', 'lat_write', + 'line_size', 'mem_load_par', 'misc_mhz', 'tlb', 'load', + 'int64_add', 'int64_bit', 'int64_div', 'int64_mod', + 'int64_mul' + ) { + $last = eval '$#' . $array; + if ($last != $n) { + #warn "No data for $array in $file\n"; + push(@warn, $array); + eval 'push(@' . $array . ', -1);'; + } + } +# if ($#warn != -1) { +# warn "Missing data in $file: @warn\n"; +# } + $n++; +} + +print<<EOF; + + L M B E N C H 3 . 0 S U M M A R Y + ------------------------------------ + (Alpha software, do not distribute) + +EOF + +&print_basic; +&print_process; +&print_int; +&print_uint64; +&print_float; +&print_double; +&print_ctx; +&print_ipc_local; +&print_ipc_remote; +&print_file_vm; +&print_bw_ipc_local; +&print_mem; + +exit 0; + +sub print_basic +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'tlb', 'line_size', 'mem_load_par' )) <= 0) { + return; + } + print<<EOF; +Basic system parameters +------------------------------------------------------------------------------ +Host OS Description Mhz tlb cache mem scal + pages line par load + bytes +--------- ------------- ----------------------- ---- ----- ----- ------ ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'tlb', 'line_size', 'mem_load_par' )) <= 0) { + next; + } + printf "%-9.9s %13.13s %23.23s ", + $host[$i], &getos($uname[$i]), $file[$i]; + printf "%4.4s %5.5s %5.5s %6.6s %4.4s\n", + &inum($misc_mhz[$i], 4), + &inum($tlb[$i], 5), + &inum($line_size[$i], 5), + &num($mem_load_par[$i], 6), + &inum($load[$i], 4); + } +} + +sub 
print_process +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_syscall', 'lat_read', 'lat_write', + 'lat_stat', 'lat_openclose', 'lat_tcp_select', + 'lat_siginstall', 'lat_sigcatch', + 'lat_nullproc', 'lat_simpleproc', + 'lat_shproc' )) <= 0) { + return; + } + print<<EOF; + +Processor, Processes - times in microseconds - smaller is better +------------------------------------------------------------------------------ +Host OS Mhz null null open slct sig sig fork exec sh + call I/O stat clos TCP inst hndl proc proc proc +--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- +EOF + + @fs_delete_4k = @lat_ctx0_8 = @bw_file = @lat_ctx0_16 = @fs_delete_1k = + @fs_create_4k = @fs_create_1k + if 0; # lint + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_syscall', 'lat_read', 'lat_write', + 'lat_stat', 'lat_openclose', 'lat_tcp_select', + 'lat_siginstall', 'lat_sigcatch', + 'lat_nullproc', 'lat_simpleproc', + 'lat_shproc' )) <= 0) { + next; + } + # If they have no /dev/zero, use /dev/null, else average them. 
+ if ($lat_read[$i] == -1) { + $tmp = $lat_write[$i]; + } else { + $tmp = ($lat_read[$i] + $lat_write[$i]) / 2; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%4.0f %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s %4.4s\n", + $misc_mhz[$i], + &num($lat_syscall[$i], 4), + &num($tmp, 4), + &num($lat_stat[$i], 4), + &num($lat_openclose[$i], 4), + &num($lat_tcp_select[$i], 4), + &num($lat_siginstall[$i], 4), + &num($lat_sigcatch[$i], 4), + &num($lat_nullproc[$i], 4), + &num($lat_simpleproc[$i], 4), + &num($lat_shproc[$i], 4); + } +} + +sub print_int +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'integer_bit', 'integer_add', + 'integer_mul', 'integer_div', + 'integer_mod' )) <= 0) { + return; + } + print<<EOF; + +Basic integer operations - times in nanoseconds - smaller is better +------------------------------------------------------------------- +Host OS intgr intgr intgr intgr intgr + bit add mul div mod +--------- ------------- ------ ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'integer_bit', 'integer_add', + 'integer_mul', 'integer_div', + 'integer_mod' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s %6.6s\n", + &scale_num($integer_bit[$i], 6, $load[$i]), + &scale_num($integer_add[$i], 6, $load[$i]), + &scale_num($integer_mul[$i], 6, $load[$i]), + &scale_num($integer_div[$i], 6, $load[$i]), + &scale_num($integer_mod[$i], 6, $load[$i]); + } +} + +sub print_uint64 +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'int64_bit', 'int64_add', + 'int64_mul', 'int64_div', + 'int64_mod' )) <= 0) { + return; + } + print<<EOF; + +Basic uint64 operations - times in nanoseconds - smaller is better +------------------------------------------------------------------ +Host OS int64 int64 int64 int64 int64 + bit add mul div mod +--------- ------------- ------ ------ ------ ------ ------ +EOF + + for ($i = 
0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'int64_bit', 'int64_add', + 'int64_mul', 'int64_div', + 'int64_mod' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf " %5.5s %6.6s %6.6s %6.6s %6.6s\n", + &scale_num($int64_bit[$i], 6, $load[$i]), + &scale_num($int64_add[$i], 6, $load[$i]), + &scale_num($int64_mul[$i], 6, $load[$i]), + &scale_num($int64_div[$i], 6, $load[$i]), + &scale_num($int64_mod[$i], 6, $load[$i]); + } +} + +sub print_float +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'float_add', 'float_mul', 'float_div', + 'float_bogomflops' )) <= 0) { + return; + } + print<<EOF; + +Basic float operations - times in nanoseconds - smaller is better +----------------------------------------------------------------- +Host OS float float float float + add mul div bogo +--------- ------------- ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'float_add', 'float_mul', + 'float_div', + 'float_bogomflops' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s\n", + &scale_num($float_add[$i], 6, $load[$i]), + &scale_num($float_mul[$i], 6, $load[$i]), + &scale_num($float_div[$i], 6, $load[$i]), + &scale_num($float_bogomflops[$i], 6, $load[$i]); + } +} + +sub print_double +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'double_add', 'double_mul', 'double_div', + 'double_bogomflops' )) <= 0) { + return; + } + print<<EOF; + +Basic double operations - times in nanoseconds - smaller is better +------------------------------------------------------------------ +Host OS double double double double + add mul div bogo +--------- ------------- ------ ------ ------ ------ +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'double_add', 'double_mul', + 'double_div', + 'double_bogomflops' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf 
"%6.6s %6.6s %6.6s %6.6s\n", + &scale_num($double_add[$i], 6, $load[$i]), + &scale_num($double_mul[$i], 6, $load[$i]), + &scale_num($double_div[$i], 6, $load[$i]), + &scale_num($double_bogomflops[$i], 6, $load[$i]); + } +} + +sub print_ctx +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_ctx0_2', 'lat_ctx16_2', + 'lat_ctx64_2', 'lat_ctx16_8', + 'lat_ctx64_8', 'lat_ctx16_16', + 'lat_ctx64_16' )) <= 0) { + return; + } + print<<EOF; + +Context switching - times in microseconds - smaller is better +------------------------------------------------------------------------- +Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K + ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw +--------- ------------- ------ ------ ------ ------ ------ ------- ------- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_ctx0_2', 'lat_ctx16_2', + 'lat_ctx64_2', 'lat_ctx16_8', + 'lat_ctx64_8', 'lat_ctx16_16', + 'lat_ctx64_16' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%6.6s %6.6s %6.6s %6.6s %6.6s %7.7s %7.7s\n", + &num($lat_ctx0_2[$i], 6), + &num($lat_ctx16_2[$i], 6), + &num($lat_ctx64_2[$i], 6), + &num($lat_ctx16_8[$i], 6), + &num($lat_ctx64_8[$i], 6), + &num($lat_ctx16_16[$i], 7), + &num($lat_ctx64_16[$i], 7); + } +} + +sub print_ipc_local +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_ctx0_2', 'lat_pipe', + 'lat_unix', 'lat_udp_local', + 'lat_rpc_udp_local', 'lat_tcp_local', + 'lat_rpc_tcp_local', + 'lat_tcp_connect_local' )) <= 0) { + return; + } + print<<EOF; + +*Local* Communication latencies in microseconds - smaller is better +--------------------------------------------------------------------- +Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP + ctxsw UNIX UDP TCP conn +--------- ------------- ----- ----- ---- ----- ----- ----- ----- ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_ctx0_2', 'lat_pipe', + 'lat_unix', 'lat_udp_local', + 
'lat_rpc_udp_local', 'lat_tcp_local', + 'lat_rpc_tcp_local', + 'lat_tcp_connect_local' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%5.5s %5.5s %4.4s %5.5s %5.5s %5.5s %5.5s %4.4s\n", + &num($lat_ctx0_2[$i], 5), + &num($lat_pipe[$i], 5), + &num($lat_unix[$i], 4), + &num($lat_udp_local[$i], 5), + &num($lat_rpc_udp_local[$i], 5), + &num($lat_tcp_local[$i], 5), + &num($lat_rpc_tcp_local[$i], 5), + &scale_num($lat_tcp_connect_local[$i], 5, $load[$i]); + } +} + +sub print_ipc_remote +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_udp_remote', + 'lat_rpc_udp_remote', 'lat_tcp_remote', + 'lat_rpc_tcp_remote', + 'lat_tcp_connect_remote' )) <= 0) { + return; + } + print<<EOF; + +*Remote* Communication latencies in microseconds - smaller is better +--------------------------------------------------------------------- +Host OS UDP RPC/ TCP RPC/ TCP + UDP TCP conn +--------- ------------- ----- ----- ----- ----- ---- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_udp_remote', + 'lat_rpc_udp_remote', 'lat_tcp_remote', + 'lat_rpc_tcp_remote', + 'lat_tcp_connect_remote' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%5.5s %5.5s %5.5s %5.5s %4.4s\n", + &num($lat_udp_remote[$i], 5), + &num($lat_rpc_udp_remote[$i], 5), + &num($lat_tcp_remote[$i], 5), + &num($lat_rpc_tcp_remote[$i], 5), + &scale_num($lat_tcp_connect_remote[$i], 4, $load[$i]); + } +} + +sub print_file_vm +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'fs_create_0k', 'fs_create_10k', + 'fs_delete_0k', 'fs_delete_10k', + 'lat_mappings', 'lat_protfault', + 'lat_pagefault' )) <= 0) { + return; + } + print<<EOF; + +File & VM system latencies in microseconds - smaller is better +------------------------------------------------------------------------------- +Host OS 0K File 10K File Mmap Prot Page 100fd + Create Delete Create Delete Latency Fault Fault selct +--------- 
------------- ------ ------ ------ ------ ------- ----- ------- ----- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'fs_create_0k', 'fs_create_10k', + 'fs_delete_0k', 'fs_delete_10k', + 'lat_mappings', 'lat_protfault', + 'lat_pagefault' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + $c0k = $fs_create_0k[$i] <= 0 ? -1 : $M / $fs_create_0k[$i]; + $c10k = $fs_create_10k[$i] <= 0 ? -1 : $M / $fs_create_10k[$i]; + $d0k = $fs_delete_0k[$i] <= 0 ? -1 : $M / $fs_delete_0k[$i]; + $d10k = $fs_delete_10k[$i] <= 0 ? -1 : $M / $fs_delete_10k[$i]; + printf "%6.6s %6.6s %6.6s %6.6s %7.7s %5.5s %7.7s %5.5s\n", + &scale_num($c0k, 6, $load[$i]), + &scale_num($d0k, 6, $load[$i]), + &scale_num($c10k, 6, $load[$i]), + &scale_num($d10k, 6, $load[$i]), + &num($lat_mappings[$i], 7), + &num($lat_protfault[$i], 5), + &num($lat_pagefault[$i], 7), + &num($lat_fd_select[$i], 5); + } +} + +sub print_bw_ipc_local +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'bw_pipe', 'bw_unix', + 'bw_tcp_local', 'bw_reread', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', + 'bw_mem_rdsum' , 'bw_mem_wr' )) <= 0) { + return; + } + print<<EOF; + +*Local* Communication bandwidths in MB/s - bigger is better +----------------------------------------------------------------------------- +Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem + UNIX reread reread (libc) (hand) read write +--------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- ----- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'bw_pipe', 'bw_unix', + 'bw_tcp_local', 'bw_reread', + 'bw_bcopy_libc', 'bw_bcopy_unrolled', + 'bw_mem_rdsum' , 'bw_mem_wr' )) <= 0) { + next; + } + printf "%-9.9s %13.13s ", $host[$i], &getos($uname[$i]); + printf "%4.4s %4.4s %4.4s %6.6s %6.6s %6.6s %6.6s %4.4s %5.5s\n", + &num($bw_pipe[$i], 4), + &num($bw_unix[$i], 4), + &num($bw_tcp_local[$i], 4), + &num($bw_reread[$i], 6), + &num($bw_mmap[$i], 6), + 
&num($bw_bcopy_libc[$i], 6), + &num($bw_bcopy_unrolled[$i], 6), + &num($bw_mem_rdsum[$i], 4), + &num($bw_mem_wr[$i], 5); + } +} + +sub print_mem +{ + local($i); + local($t); + + if (&resultsq(0, $#uname, ( 'lat_l1', 'lat_l2', 'lat_mem' )) <= 0) { + return; + } + print<<EOF; + +Memory latencies in nanoseconds - smaller is better + (WARNING - may not be correct, check graphs) +------------------------------------------------------------------------------ +Host OS Mhz L1 \$ L2 \$ Main mem Rand mem Guesses +--------- ------------- --- ---- ---- -------- -------- ------- +EOF + + for ($i = 0; $i <= $#uname; $i++) { + if (&resultsq($i, $i, ( 'lat_l1', 'lat_l2', 'lat_mem' )) <= 0) { + next; + } + printf "%-9.9s %13.13s %4d", + $host[$i], &getos($uname[$i]), $misc_mhz[$i]; + $msg = &check_caches; + if ($lat_l1[$i] < 0) { + printf "%6s %6s %11s %s", + "-", "-", "-", + "Bad mhz?"; + } else { + printf " %6.6s %6.6s %6.6s %11.11s", + &num($lat_l1[$i], 6), + &num($lat_l2[$i], 6), + &num($lat_mem[$i], 6), + &num($lat_mem_rand[$i], 6); + print $msg if ($msg =~ /L/); + } + print "\n"; + } +} + + +# checks to see if there are any valid results +# +sub resultsq +{ + local($low, $high, @pars) = @_; + local($i); + local($val); + + for ($i = $low; $i <= $high; $i++) { + foreach $p (@pars) { + $val = eval '$' . $p . '[' . $i . 
']'; + if ($val > 0) { + return (1); + } + } + } + return (0); +} + +# (33, %3d) +sub inum +{ + local($val, $len) = @_; + local($str) = ""; + local($i); + + if (!defined($val) || !($val =~ /^[ ]*[0-9.]+[ ]*$/)) { + $val = -1; + } + if ($val <= 0) { + $str = ""; + for ($i = 0; $i < $len; $i++) { + $str .= " "; + } + return ($str); + } + + $fmt = sprintf("%%%dd", $len); + $str = sprintf($fmt, $val); + + $str; +} +# (33, %3d, scale) +sub scale_num +{ + local($val, $len, $scale) = @_; + + if ($scale > 1) { + $val = -1 + } + return (&num($val, $len)); +} +# (33, %3d) +sub num +{ + local($val, $len) = @_; + local($str) = ""; + local($i); + + if (!defined($val) || !($val =~ /^[ ]*[0-9.]+[ ]*$/)) { + $val = -1; + } + if ($val <= 0) { + $str = ""; + for ($i = 0; $i < $len; $i++) { + $str .= " "; + } + return ($str); + } + if ($val >= 10 * $M) { + $nstr = sprintf("%.1f", $val / $M); + $fmt = sprintf("%%%d.%ds%%s", $len - 1, $len - 1); + $str = sprintf($fmt, $nstr, "M"); + } elsif ($val >= 10 * $K) { + $nstr = sprintf("%.1f", $val / $K); + $fmt = sprintf("%%%d.%ds%%s", $len - 1, $len - 1); + $str = sprintf($fmt, $nstr, "K"); + } elsif ($val >= 10) { + $nstr = sprintf("%.1f", $val); + $fmt = sprintf("%%%d.%ds", $len, $len); + $str = sprintf($fmt, $nstr); + } elsif ($val < 0.001) { + $fmt = sprintf("%%%d.%de", $len, $len - 6); + $str = sprintf($fmt, $val); + } else { + $fmt = sprintf("%%%d.%df", $len, $len - 2); + $str = sprintf($fmt, $val); + } + $str; +} + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest value before the blank line. 
+sub getbiggest +{ + local($msg) = @_; + local($line) = 0; + + undef $save; + $value = 0; + while (<FD>) { + $line++; + #warn "$line $_"; + last if /^\s*$/; + last if (!($_ =~ /^\d+/)); + $save = $_ if /^\d+\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 4) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} diff --git a/performance/lmbench3/scripts/gifs b/performance/lmbench3/scripts/gifs new file mode 100755 index 0000000..6691b58 --- /dev/null +++ b/performance/lmbench3/scripts/gifs @@ -0,0 +1,33 @@ + +# Make HTML files that will point to the right GIF files. +# Usage: bghtml file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. 
+# $Id: gifs 1.4 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +&pbm; +exit 0; + +sub pbm +{ + @ctx = <HTML/ctx*.pbm>; pop(@ctx); + @mem = <HTML/mem*.pbm>; pop(@mem); + @bar = <HTML/bar*.pbm>; pop(@bar); + + foreach $i (<HTML/*.pbm>) { + ($out = $i) =~ s/.pbm//; + warn "Bitmap munging $out\n"; + #system "pnmcrop < $i | ppmtogif -transparent 1,1,1 > $out"; + system " +pnmcrop < $i > HTML/___tmp 2>/dev/null +set `pnmfile HTML/___tmp` +newx=`expr \$4 - 2` +newy=`expr \$6 - 2` +pnmcut 1 1 \$newx \$newy < HTML/___tmp > HTML/___tmp.pnm +convert -mattecolor slategrey -frame 15x15+0+6 HTML/___tmp.pnm HTML/___tmp.ppm +ppmtogif < HTML/___tmp.ppm > $out.gif 2>/dev/null"; + } +} diff --git a/performance/lmbench3/scripts/gnu-os b/performance/lmbench3/scripts/gnu-os new file mode 100755 index 0000000..f2f8819 --- /dev/null +++ b/performance/lmbench3/scripts/gnu-os @@ -0,0 +1,1439 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + +timestamp='2004-08-18' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Originally written by Per Bothner <per@xxxxxxxxxxx>. +# Please send patches to <config-patches@xxxxxxx>. Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to <config-patches@xxxxxxx>." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit 0 ;; + --version | -v ) + echo "$version" ; exit 0 ;; + --help | --h* | -h ) + echo "$usage"; exit 0 ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. 
+ break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +for t in /usr/tmp /var/tmp /tmp; do + if [ -d $t -a -w $t ] + then TMPDIR=$t + break + fi +done + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d -q "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. 
+# (ghazi@xxxxxxxxxxxxxxx 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? 
+ os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" + exit 0 ;; + amiga:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + arc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + hp300:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mac68k:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + macppc:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme68k:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme88k:OpenBSD:*:*) + echo m88k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvmeppc:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + pegasos:OpenBSD:*:*) + echo powerpc-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + pmax:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sgi:OpenBSD:*:*) + echo mipseb-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sun3:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + wgrisc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + *:OpenBSD:*:*) + echo ${UNAME_MACHINE}-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + alpha:OSF1:*:*) + if test $UNAME_RELEASE = "V4.0"; then + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + fi + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced 
since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit 0 ;; + Alpha*:OpenVMS:*:*) + echo alpha-hp-vms + exit 0 ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? 
+ echo alpha-pc-interix + exit 0 ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit 0 ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit 0;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit 0 ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit 0 ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit 0 ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit 0 ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit 0;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit 0;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@xxxxxxxxxxxxxxxxxxxx (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit 0 ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit 0 ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit 0 ;; + DRS?6000:UNIX_SV:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7 && exit 0 ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. 
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit 0 ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit 0 ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit 0 ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit 0 ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit 0 ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit 0 ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit 0 ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit 0 ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit 0 ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit 0 ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit 
0 ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include <stdio.h> /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c \ + && $dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \ + && exit 0 + echo mips-mips-riscos${UNAME_RELEASE} + exit 0 ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit 0 ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit 0 ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit 0 ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit 0 ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit 0 ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit 0 ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit 0 ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit 0 ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit 0 ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit 0 ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + 
exit 0 ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit 0 ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit 0 ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit 0 ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit 0 ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <sys/systemcfg.h> + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0 + echo rs6000-ibm-aix3.2.5 + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit 0 ;; + *:AIX:*:[45]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit 0 ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit 0 ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit 0 ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit 0 ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit 0 ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit 0 ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit 0 ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo 
m68k-hp-bsd4.4 + exit 0 ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include <stdlib.h> + #include <unistd.h> + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + # avoid double evaluation of $set_cc_for_build + test -n "$CC_FOR_BUILD" || eval $set_cc_for_build + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E -) | grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit 0 ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo 
ia64-hp-hpux${HPUX_REV} + exit 0 ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <unistd.h> + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0 + echo unknown-hitachi-hiuxwe2 + exit 0 ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit 0 ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit 0 ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit 0 ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit 0 ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit 0 ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit 0 ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit 0 ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit 0 ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit 0 ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit 0 ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit 0 ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + 
CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + *:UNICOS/mp:*:*) + echo nv1-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit 0 ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit 0 ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit 0 ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit 0 ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit 0 ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit 0 ;; + *:FreeBSD:*:*) + # Determine whether the default compiler uses glibc. + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <features.h> + #if __GLIBC__ >= 2 + LIBC=gnu + #else + LIBC= + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` + # GNU/KFreeBSD systems have a "k" prefix to indicate we are using + # FreeBSD's kernel, but not the complete OS. 
+ case ${LIBC} in gnu) kernel_only='k' ;; esac + echo ${UNAME_MACHINE}-unknown-${kernel_only}freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`${LIBC:+-$LIBC} + exit 0 ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit 0 ;; + i*:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit 0 ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit 0 ;; + x86:Interix*:[34]*) + echo i586-pc-interix${UNAME_RELEASE}|sed -e 's/\..*//' + exit 0 ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit 0 ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? + echo i586-pc-interix + exit 0 ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit 0 ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit 0 ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit 0 ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit 0 ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit 0 ;; + arm*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit 0 ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if 
defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` + test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0 + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` + test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0 + ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit 0 ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit 0 ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit 0 ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit 0 ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit 0 ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit 0 ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit 0 ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit 0 ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it will tell us. cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit 0 ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit 0 ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. 
+ echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit 0 ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <features.h> + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #ifdef __INTEL_COMPILER + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` + test x"${LIBC}" != x && echo "${UNAME_MACHINE}-pc-linux-${LIBC}" && exit 0 + test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0 + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit 0 ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit 0 ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. 
+ echo ${UNAME_MACHINE}-pc-os2-emx + exit 0 ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit 0 ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit 0 ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit 0 ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit 0 ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit 0 ;; + i*86:*:5:[78]*) + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit 0 ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` + echo ${UNAME_MACHINE}-pc-isc$UNAME_REL + elif /bin/uname -X 2>/dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit 0 ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. 
+ echo i386-pc-msdosdjgpp + exit 0 ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit 0 ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit 0 ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit 0 ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit 0 ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit 0 ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit 0 ;; + M68*:*:R3V[567]*:*) + test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4.3${OS_REL} && exit 0 + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4 && exit 0 ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit 0 ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit 0 ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit 0 ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit 0 ;; + 
*:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit 0 ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@xxxxxxxxxxxxxxxxx> + echo i586-unisys-sysv4 + exit 0 ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@xxxxxxxxxxxxxx>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit 0 ;; + *:*:*:FTX*) + # From seanf@xxxxxxxxxxxxxxxx. + echo i860-stratus-sysv4 + exit 0 ;; + *:VOS:*:*) + # From Paul.Green@xxxxxxxxxxx. + echo hppa1.1-stratus-vos + exit 0 ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit 0 ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit 0 ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit 0 ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit 0 ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit 0 ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
+ echo i586-pc-beos + exit 0 ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit 0 ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit 0 ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit 0 ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit 0 ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit 0 ;; + *:Darwin:*:*) + case `uname -p` in + *86) UNAME_PROCESSOR=i686 ;; + powerpc) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit 0 ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit 0 ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit 0 ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit 0 ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit 0 ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit 0 ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit 0 ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit 0 ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit 0 ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit 0 ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit 0 ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit 0 ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit 0 ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit 0 ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit 0 ;; + *:DRAGONFLY:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly${UNAME_RELEASE} + exit 0 ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c <<EOF +#ifdef _SEQUENT_ +# include <sys/types.h> +# include <sys/utsname.h> +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include <sys/param.h> + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include <sys/param.h> +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && $dummy && exit 0 + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit 0 ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + c34*) + echo c34-convex-bsd + exit 0 ;; + c38*) + echo c38-convex-bsd + exit 0 ;; + c4*) + echo c4-convex-bsd + exit 0 ;; + esac +fi + +cat >&2 <<EOF +$0: unable to guess system type + +This script, last modified $timestamp, has failed to recognize +the operating system you are using. It is advised that you +download the most up to date version of the config scripts from + + ftp://ftp.gnu.org/pub/gnu/config/ + +If the version you run ($0) is already up to date, please +send the following data and any information you think might be +pertinent to <config-patches@xxxxxxx> in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/performance/lmbench3/scripts/graph b/performance/lmbench3/scripts/graph new file mode 100755 index 0000000..63cbefc --- /dev/null +++ b/performance/lmbench3/scripts/graph @@ -0,0 +1,947 @@ + +# $Id: graph 1.12 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval "exec perl -Ss $0 $@" + if 0; + +# A graphing preprocessor for GNU pic / troff package. +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# Input format is like that of Xgraph, i.e., sets of X Y pairs, +# divided up by blank lines and titled with a "title. Like so +# +# 1 1 +# 2 2 +# "straight slope +# +# 4 4 +# 1 4 +# "straight down +# +# Optional "quartile" data input format. +# The drawing is ----- o ---, with the lines being from y1..y2, y4..y5, +# and the mark at y3. 
+# +# x y1 y2 y3 y4 y5 +# x y1 y2 y3 y4 y5 +# x y1 y2 y3 y4 y5 +# +# Optional input (superset of Xgraph) is like so: +# +# %T Graph title in +4 point font +# %X X axis title and/or units in +2 point font +# %Y Y axis title and/or units in +2 point font +# %P Page title in +4 point font +# %fakemax-X <value> force graph to be that big +# %fakemax-Y <value> force graph to be that big +# %fakemin-X <value> force graph to be that big +# %fakemin-Y <value> force graph to be that big +# +# Options: +# -lm implies -big -below -grid -close +# -rev reverse X/Y data sense (and titles) +# -below put data set titles below the graph rather than to the right +# -close no extra space around the data +# -qline connect the quartile center points +# -grid grid :-) +# -halfgrid Grid lines where the major ticks are +# -nobox no box around whole graph +# -big make the graph take the whole page +# -slide make the graph fit in my slides +# -small make the graph be small so you can do a lot of them. +# -notitle no Title label +# -nolabels no X/Y/Title labels +# -nodatal no dataset labels +# -nomarks no marks on the graphs. +# -nolines no lines connecting the marks (don't use w/ -nomarks :-) +# -k print (absolute) values larger than 1000 as (value/1000)K +# -grapheach graph each data set separately +# -br_title start a new graph at each title. +# -nospace no .sp at top of picture +# -ts time series, X axis is implied. +# -hist produce a histogram graph +# +# Hacks :-) +# -xk multiply X input by 1024. +# -xm multiply X input by 1024*1024. +# -logx take the log base 2 of X input +# -logy take the log base 2 of Y input +# -cut add cut marks so that image croppers dont crop too close +# +# Much thanks to James Clark for providing such a nice replacement for +# the Unix troff package. Thanks to the Xgraph folks for providing +# inspiration. Thanks to Declan Murphy for math :-) +# Thanks to noone for floating point numbers, they suck dog doo. 
+# There are lots of hacks in here to deal with rounding errors. +# +# TODO: +# All of the option parsing done manually. +# A filter option to print ranges of the data? +# A way to do each data set in it's own graph. +# All of the other xgraph options? +# For Adam, that butthead, an option to sort the labels such that they +# are in the same order as the right endpoints of the data sets. + +&init; +&autosize; +&pic; +exit; + +# init - slurp in the data and apply any transformations. +sub init +{ + # Lint for the options. + $qline = $ts = $close = $nolines = $thk1 = $thk2 = $k = $notitle + = $thk1_5 = $xm = $grid = $nospace = $lm = $hist = 0 if 0; + + if ($grapheach) { $grapheach = 1; $cut = 0; } else { $grapheach = 0; } + if ($halfgrid) { $halfgrid = 1; } else { $halfgrid = 0; } + if ($hist) { $nobox = 1; $nolabels = 1; $close = 1; $nolines = 1; } + if ($lm) { $big = $below = $grid = $close = 1; } + + # Accept %options=value on the command line. + while ($ARGV[0] =~ /^%/) { + $_ = $ARGV[0]; + s/=/ /; + push(@lines, "$_\n"); + shift(@ARGV); + } + + # OK, sometimes we get + # %T title + # %X X axis, etc. + # + # "data set 1 + # + # And this messes up the numbering later on. So we carefully dump the + # whitespace between the control and data. 
+ while (<>) { + last if /^\s*$/; + push(@lines, $_); + last if /^"/; + last if /^\d/; + } + push(@lines, <>); + $fake = ""; + $items = 0; + $stat_sum = 0; + $min = 1.7E+308; + $max = 2.2E-308; + foreach (@lines) { + if (/^"?%fake/) { + $fake = $_; + s/"?%fakemax-//; + s/"?%fakemin-//; + @_ = split; + $_ = "$_[1] $_[1]"; + } elsif (/^%hist\s/) { + split; + shift(@_); + ($hist_bsize, $hist_low, $hist_high) = @_; + next; + } else { + next if /^\s*["%#]/; + next if /^\s*$/; + } + if ($ts) { + $_ = "$items $_"; + } + $items++; + @_ = split; + if ($xk) { + $_[0] = $_[0] * 1024; + } elsif ($xm) { + $_[0] = $_[0] * 1024 * 1024; + } + if ($logx) { + $_[0] = &logbase(2, $_[0]); + } + if ($yk) { + $_[1] = $_[1] * 1024; + } elsif ($ym) { + $_[1] = $_[1] * 1024 * 1024; + } + if ($logy) { + $_[1] = &logbase(2, $_[1]); + } + if ($rev) { + $_ = "$_[1] $_[0]"; + $y = $_[0]; + } else { + $_ = "$_[0] $_[1]"; + $y = $_[1]; + } + $stat_sum += $y; + $max = $y if ($y > $max); + $min = $y if ($y < $min); + push(@y, $y); + if ($fake =~ /[XY]/) { + # XXX - reverse? What should it do? + if ($fake =~ /fakemax-X/) { + $fakemax_X = $_[0]; + } elsif ($fake =~ /fakemax-Y/) { + $fakemax_Y = $_[1]; + } elsif ($fake =~ /fakemin-X/) { + $fakemin_X = $_[0]; + } elsif ($fake =~ /fakemin-Y/) { + $fakemin_Y = $_[1]; + } + $_ = $fake; + $fake = ""; + } + } + + # Do some statistics. + @s = sort(@y); + if ($items & 1) { + $stat_median = $s[($items + 1)/2]; + } else { + $i = $items / 2; + $stat_median = ($s[$i] + $s[$i+1]) / 2; + } + $stat_avg = $stat_sum/$items; + $stat_avgdev = $stat_var = 0; + # $stat_skew = $stat_curt = 0; + foreach $_ (@lines) { + next if /^\s*["#%]/; + next if /^\s*$/; + @_ = split; + $stat_var += ($_[1] - $stat_median) ** 2; + $tmp = $_[1] - $stat_median; + $stat_avgdev += $tmp > 0 ? 
$tmp : -$tmp; + } + $stat_var /= $items - 1; + $stat_stddev = sqrt($stat_var); + $stat_avgdev /= $items; + if ($ts) { + printf STDERR "N=$items min=$min max=$max med=%.2f avg=%.2f stddev=%.2f avgdev=%.2f\n", + $stat_median, $stat_avg, $stat_stddev, $stat_avgdev; + } + + # Diddle this to create different marks. + @marks = ( + '[ "\s+2\(bu\s0" ]', + '[ "\(sq" ]', + '[ "\(*D" ]', + '[ "\s+2\(pl\s0" ]', + '[ "\(*F" ]', + '[ "\s+2\fB\(mu\fP\s0" ]', + '[ circle rad .035 fill 0 ]', + '[ box ht .07 wid .07 fill 1 ]', + '[ "\(dd" ]', + ); + $nmarks = $#marks + 1; + $nomark = '[ box invis ht .05 wid .05 ]'; + + $first_title = 1; + + if ($nospace) { + $graphspace = "0"; + } elsif ($small) { + $graphspace = ".15i"; + } elsif ($medium) { + $graphspace = ".20i"; + } else { + $graphspace = ".25i"; + } + + if ($small) { + $marks[0] = '[ circle rad .007 fill 1 ]'; + $PS = 10; + $ft = "B"; + $tick = .1; + } elsif ($medium) { + $PS = 11; + $ft = "HB"; + $tick = .1; + } elsif ($slide) { + $ft = "HB"; + $PS = 11; + $tick = .15; + } else { + $ft = "CB"; + $PS = 12; + $tick = .15; + } + $thk = .75; + $thk = 1 if $thk1; + $thk = 1.5 if $thk1_5; + $thk = 2 if $thk2; + $thk = .2 if $thk_2; + $gthk = .25; + $gthk = 1 if $gthk1; + $gthk = .75 if $gthk_75; + $gthk = .5 if $gthk_5; + $lineinvis = $nolines ? "invis" : ""; +} + +# Calculate min/max to autosize the graph. 
+sub autosize +{ + foreach $_ (@lines) { + next if /^\s*["#%]/; + next if /^\s*$/; + @_ = split; + if ($#_ == 1) { + $Ymax = $Ymin = $_[1]; + } elsif ($#_ == 5) { # Quartile plot + $Ymax = $Ymin = $_[1]; + for ($i = 2; $i <= 5; ++$i) { + $Ymax = $_[$i] if ($Ymax < $_[$i]); + $Ymin = $_[$i] if ($Ymin > $_[$i]); + } + } else { + die "Data format error: $_\n"; + } + if (!defined $xmin) { + $xmin = $_[0]; + $xmax = $_[0]; + $ymin = $Ymin; + $ymax = $Ymax; + } + else { + $xmin = $_[0] if ($xmin > $_[0]); + $xmax = $_[0] if ($xmax < $_[0]); + $ymin = $Ymin if ($ymin > $Ymin); + $ymax = $Ymax if ($ymax < $Ymax); + } + } + + # Handle fake max + if (defined($fakemax_X) && $fakemax_X > $xmax) { + $xmax = $fakemax_X; + } + if (defined($fakemax_Y) && $fakemax_Y > $ymax) { + $ymax = $fakemax_Y; + } + if (defined($fakemin_X) && $fakemin_X < $xmin) { + $xmin = $fakemin_X; + } + if (defined($fakemin_Y) && $fakemin_Y < $ymin) { + $ymin = $fakemin_Y; + } + if ($hist) { + $xmax += $hist_bsize; + } + warn "n=$items xmin=$xmin xmax=$xmax ymin=$ymin ymax=$ymax\n" if $debug; + ($xlower, $xupper, $xtick) = &tick($xmin, $xmax, $logx ? 2 : 10); + ($ylower, $yupper, $ytick) = &tick($ymin, $ymax, $logy ? 
2 : 10); + if ($ymax + $ytick*.45 < $yupper) { + $yupper -= $ytick; + $ypartial = $ymax - $yupper; + } else { + $ypartial = 0; + } + $xn = int(.9 + ($xupper - $xlower) / $xtick); + $yn = int(.9 + ($yupper - $ylower) / $ytick); + $xlower = sprintf("%.6f", $xlower); # really ugly cast + $xupper = sprintf("%.6f", $xupper); # really ugly cast + $xtick = sprintf("%.6f", $xtick); # really ugly cast + $xn = sprintf("%.0f", $xn); # really ugly cast + $ylower = sprintf("%.6f", $ylower); # really ugly cast + $yupper = sprintf("%.6f", $yupper); # really ugly cast + $ytick = sprintf("%.6f", $ytick); # really ugly cast + $yn = sprintf("%.0f", $yn); # really ugly cast +} + +# Since I had to go rethink it, here's the explanation: +# +# log base e 10 = X implies e**x = 10 +# e ** (v * x) = (e ** x) ** v +# since e ** x == 10, that implies e ** (v * x) is 10 ** v +# Capeesh? +sub expbase +{ + local($base, $val) = @_; + + exp($val * log($base)); +} + +sub logbase +{ + local($base, $val) = @_; + + if ($val == 0) { + return 0; + } + if ($val < 0) { + die "Input: $_: can't take log of negative value: $val\n"; + } + log($val) / log($base); +} + +# Figure out the tick marks. +# XXX - the log stuff is not quite right. +sub tick +{ + local($min, $max, $base) = @_; + local($delta, $adj, $lower, $upper, $tick); + + $delta = $max - $min; + $tick = int(&logbase(10, $delta)); + $tick = &expbase(10, $tick - 1); + if ($delta / $tick > 10) { + if ($base == 10) { + if (($delta / (2 * $tick)) > 15) { + $adj = 10; + } elsif (($delta / (2 * $tick)) > 10) { + $adj = 5; + } else { + $adj = 2; + } + } else { + $adj = 2; + } + } else { + $adj = 1; + } + $tick *= $adj; + + # Go figure out the endpoints. This is O(log10(n)) where N is the + # number of ticks from 0 to the min. 
+ $lower = 0; + for ($i = 10e99; $i > 0; $i = int($i/$base)) { + $fudge = $i * $tick; + $bound = $min + $fudge * .00001; + + # Sometimes it's too big + while ($lower > $bound) { + $lower -= $fudge; + } + + # Sometimes it's too small + while (($lower + $fudge) <= $bound) { + $lower += $fudge; + } + } + + if ($base == 2) { + if ($tick < 1) { + $tick = 1; + } else { + $tick = sprintf("%.0f", $tick); + } + $lower = sprintf("%.0f", $lower); + } + for ($upper = $lower; $upper < $max - $tick * .00001; $upper += $tick) { + } + if ($base == 2) { + $upper = sprintf("%.0f", $upper); + } + # If you don't like your end points on the border then do this. + unless ($close) { + if ($min - $lower < .1 * $tick) { + $lower -= $tick; + } + if ($max - $upper < .1 * $tick) { + $upper += $tick; + } + } + ($lower, $upper, $tick); +} + +# Spit out the pic stuff. +# The idea here is to spit the variables and let pic do most of the math. +# This allows tweaking of the output by hand. +sub pic +{ + if ($k) { + $print = 'sprintf("%.0fK", j/1000)'; + } else { + $print = 'sprintf("%.0f", j)'; + } + if ($grid || $halfgrid) { + $nogrid = "dotted"; + } else { + $nogrid = "invis"; + } + if ($nobox) { + $nobox = "invis"; + } + $log_x = $logx ? "logx = 1" : "logx = 0"; + $log_y = $logy ? 
"logy = 1" : "logy = 0"; + if ($big) { + print ".sp .5i\n.po .5i\n"; + if ($below) { + $ysize = 7; + } else { + $ysize = 9; + } + if ($nodatal) { + $xsize = 7; + } else { + $xsize = 6; + } + } elsif ($small) { + $ysize = 1.75; + $xsize = 1.75; + } elsif ($medium) { + print ".po .52i\n"; + $ysize = 1.9; + $xsize = 2.05; + } elsif ($slide) { + print ".sp .35i\n"; + $xsize = 4.5; + $ysize = 4.1; + } else { + print ".sp 1i\n"; + $ysize = 5; + $xsize = 5; + } + &graph; + + # Mark the data points + @datasets = (); + for ($sub = 0; $sub <= $#lines; $sub++) { + $_ = $lines[$sub]; + if (/^\s*$/) { # end of data set + &data($set++); + if ($grapheach) { + &titles; + if ($small) { + if ($set == 4) { + print ".sp -11i\n"; + print ".po 3.5i\n"; + } elsif ($set == 8) { + print ".sp -11i\n"; + print ".po 6i\n"; + } + } else { # ??? + if ($set == 4) { + print ".sp -11i\n"; + print ".po 3.15i\n"; + } elsif ($set == 8) { + print ".sp -11i\n"; + print ".po 5.8i\n"; + } + } + + if ($sub < $#lines) { + &graph; + } + } + next; + } + if (/^"?%fake/) { # Skip this + next; + } + if (/^"?%T\s+/) { # Title specification + # Spit out the last graph at next title. 
+ if ($br_title && $graphs++ > 0) { + &titles; + if ($graphs == 5) { + print ".sp -11i\n"; + print ".po 3.5i\n"; + } elsif ($graphs == 9) { + print ".sp -11i\n"; + print ".po 6i\n"; + } + &graph; + } + s/^"?%T\s+//; + chop; + $Gtitle = $_; + next; + } + if (/^"?%X\s+/) { # X axis title specification + s/^"?%X\s+//; + chop; + $Xtitle = $_; + next; + } + if (/^"?%Y\s+/) { # Y axis title specification + s/^"?%Y\s+//; + chop; + $Ytitle = $_; + next; + } + if (/^"?%P\s+/) { # Page title specification + s/^"?%P\s+//; + chop; + $Ptitle = $_; + warn "Pt: $Ptitle\n"; + next; + } + if (/^"/) { # Data set title + s/^"//; + chop; + $dataset = $_; + push(@datasets, "$dataset"); + next; + } + push(@data, $_); + } + unless ($grapheach) { + &data($set++); + &titles; + } + if (defined($Ptitle)) { + print ".po 1i\n.sp -12i\n.ps 20\n.ce 1\n"; + print "$Ptitle\n"; + print ".po 1i\n.sp -12i\n.sp 10.4i\n.ps 20\n.ce 1\n"; + print "$Ptitle\n"; + } +} + +# Draw the titles and finish this graph. +sub titles +{ + # Do X/Y titles, if any. + unless ($nolabels) { + $Xtitle = defined($Xtitle) ? $Xtitle : "X"; + $Ytitle = defined($Ytitle) ? $Ytitle : "Y"; + if ($rev && $first_title) { + $tmp = $Xtitle; + $Xtitle = $Ytitle; + $Ytitle = $tmp; + } + print "\n# Xaxis title.\n"; + print "\"\\s+4$Xtitle\\s0\" rjust at O.se - (0, .6)\n"; + + print "\n# Yaxis title ($Ytitle)\n.ps +2\n"; + $tmp = $Ytitle; + while (length($tmp) > 0) { + $tmp =~ s/(.)//; + print "\"$1\" "; + } + print "\\\n at O.w - (.75, 0)\n.ps\n"; + + } + + # Do the graph title, if any. + $Gtitle = defined($Gtitle) ? 
$Gtitle : "Pic Graph"; + if ($grapheach) { + $Gtitle = $datasets[$#datasets]; + print "\n# Graph title.\n"; + print "\"$Gtitle\" at O.n + (0, .1)\n"; + } + + if ($br_title) { + print "\n# Graph title.\n"; + print "\"\\s+2$Gtitle\\s0\" at O.n + (0, .1)\n"; + } + + unless ($nolabels || $notitle) { + print "\n# Graph title.\n"; + if ($big) { + print "\"\\s+8$Gtitle\\s0\" at O.n + (0, .3)\n"; + } else { + print "\"\\s+4$Gtitle\\s0\" at O.n + (0, .3)\n"; + } + } + + if ($cut) { + $cutthick = .75; + print "\n# Cut marks\n"; + print "move to O.n + 0,.65; line thick $cutthick right .1\n"; + print "move to O.w - 1,0; line thick $cutthick down .1\n"; + print "move to O.e + .35,0; line thick $cutthick down .1\n"; + } + + # Do the dataset titles. + $i = 0; + unless ($nodatal) { + print "\n# Title.\n"; + if (!$grapheach) { + print ".ft R\n" if ($slide); + for ( ; $i <= $#datasets; $i++) { + print $marks[$i % $nmarks]; + if ($below) { + print " at O.sw - (0, .75 + $i * vs)\n"; + } else { + print " at O.ne + (.25, - $i * vs)\n"; + } + print + "\"$datasets[$i]\" ljust at last [].e + (.1, 0)\n"; + } + if ($cut) { + print "\nmove to O.s - 0,.75 + $i * vs\n"; + print "line thick $cutthick right .1\n"; + } + print ".ft\n" if ($slide); + } + } + + # Finish up. + print "]\n.ft\n.ps\n.PE\n"; + + # Do the statistics + if ($stats) { + $i++; + $min = sprintf "%.4f", $min; + $max = sprintf "%.4f", $max; + $stat_median = sprintf "%.4f", $stat_median; + $stat_avg = sprintf "%.4f", $stat_avg; + $stat_stddev = sprintf "%.4f", $stat_stddev; + $stat_avgdev = sprintf "%.4f", $stat_avgdev; + print <<EOF; +.ps 12 +.vs 14 +.ft CB +.po +.7i +.TS +c s +l r. +Statistics += +min $min +max $max +median $stat_median +average $stat_avg +stddev $stat_stddev +avgdev $stat_avgdev +.TE +.po -.7i +.ft +.ps +.vs +EOF + } + + $first_title = 0; +} + +sub graph +{ + if ($hist) { $hist = 1; } else { $hist = 0; } + print ".sp ${graphspace}\n"; + print <<EOF; +.PS +.ps $PS +.vs 11 +.ft $ft +[ +# Variables, tweak these. 
+ xtick = $xtick # width of an X tick + xlower = $xlower # where the xtick start + xupper = $xupper # upper range of graph + xn = $xn # number of ticks to do + ytick = $ytick # width of an Y tick + ylower = $ylower # where the ytick start + yupper = $yupper # upper range of graph + yn = $yn # number of ticks to do + xsize = $xsize # width of the graph + ysize = $ysize # height of the graph + yscale = ysize / (yupper - ylower) # scale data to paper + xscale = xsize / (xupper - xlower) # scale data to paper + tick = $tick # distance towards numbers + gthk = $gthk # thickness of grid lines + thk = $thk # thickness of data lines + grapheach = $grapheach # doing lotso little ones? + halfgrid = $halfgrid # fewer grid lines + qthk = 2.0 # thickness of quartile lines + vs = .15 # works for 10 point fonts + hist = $hist # histogram + ypartial = $ypartial # Y spillerover + $log_x # 1 if x data is log base 2 + $log_y # 1 if y data is log base 2 + +# Draw the graph borders and tick marks + O: box $nobox thick 2 ht ysize wid xsize + if (hist) then { + # The box was invisible, draw the three sides + # The partial part i sbecause we are just too big. 
+ line thick 2 from O.sw to O.se + line thick 2 from O.sw to O.nw + 0,ypartial*yscale + line thick 2 from O.se to O.ne + 0,ypartial*yscale + xgridlen = xsize + tick/2 + } else { + xgridlen = xsize + } + if (ysize < 2.5) then { + ysp = -.15 + xsp = -.2 + tick = tick * .75 + } else { + ysp = -.2 + xsp = -.25 + } + j = ylower + t = tick * .5 + for i = 0 to yn by 1 do { + ys = j - ylower + g = ys * yscale + # Draw the ticks to the numbers on the Y axis + line thick gthk from O.sw + (-tick, g) to O.sw + (0, g) + if (hist) then { + line thick gthk from O.se + (tick, g) to O.se + (0, g) + } + # Grid line across at same level as number ticks + line $nogrid thick gthk from O.sw + 0,g to O.sw + xsize,g + if (i < yn) then { + y2 = (ys + (ytick / 2)) * yscale + if (!halfgrid) then { + # Grid line across between number ticks + line $nogrid thick gthk from \\ + O.sw + (-t, y2) to O.sw + (xgridlen, y2) + } + } + if (logy == 1) then { + tmp = 2 ^ j; + if (tmp >= 1024*1024) then { + tmp = tmp / (1024*1024) + sprintf("%.0fM", tmp) at O.sw + ysp,g-.02 + } else { if (tmp >= 1024) then { + tmp = tmp / 1024 + sprintf("%.0fK", tmp) rjust at O.sw + ysp,g-.02 + } else { + sprintf("%.0f", tmp) rjust at O.sw + ysp,g-.02 + }} + } else { if (yupper - ylower > 999) then { + $print rjust at O.sw + ysp, g - .02 + if (hist) then { $print ljust at O.se + -ysp,g-.02 } + } else { if (yupper - ylower > 10) then { + sprintf("%.0f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.0f", j) ljust at O.se + -ysp,g-.02 + } + } else { if (yupper - ylower > 1) then { + sprintf("%.1f", j) rjust at O.sw + ysp, g - .02 + sprintf("%.1f", j) rjust at O.sw + ysp, g - .02 + } else { if (yupper - ylower > .1) then { + sprintf("%.2f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.2f", j) ljust at O.se + -ysp,g-.02 + } + } else { + sprintf("%.3f", j) rjust at O.sw + ysp, g - .02 + if (hist) then { + sprintf("%.3f", j) ljust at O.se + -ysp,g-.02 + } + }}}}} + j = j + ytick + } + j = 
xlower + even = 0 + for i = 0 to xn by 1 do { + even = !even + doit = !grapheach || xn > 9 || even + xs = j - xlower + g = xs * xscale + line thick gthk from O.sw + (g, -tick) to O.sw + (g, 0) + if (!hist) then { + line $nogrid thick gthk from O.sw + g,0 to O.sw + g,ysize + } + if (i < xn) then { + x2 = (xs + (xtick / 2)) * xscale + if (!halfgrid && !hist) then { + line $nogrid thick gthk from O.sw+x2,-t to O.sw+x2,ysize + } + } + if (logx == 1) then { + tmp = 2 ^ j; + if (tmp >= 1024*1024) then { + tmp = tmp / (1024*1024) + if (doit) then { + sprintf("%.0fM", tmp) at O.sw + g,xsp + } + } else { if (tmp >= 1024) then { + tmp = tmp / 1024 + if (doit) then { + sprintf("%.0fK", tmp) at O.sw + g,xsp + } + } else { + if (doit) then { + sprintf("%.0f", tmp) at O.sw + g,xsp + } + }} + } else { if (xupper - xlower > 999) then { + $print at O.sw + g, xsp + } else { if (xupper - xlower > 10) then { + sprintf("%.0f", j) at O.sw + g, xsp + } else { if (xupper - xlower > 1) then { + sprintf("%.1f", j) at O.sw + g, xsp + } else { if (xupper - xlower > .1) then { + sprintf("%.2f", j) at O.sw + g, xsp + } else { + sprintf("%.3f", j) at O.sw + g, xsp + }}}}} + j = j + xtick + } +EOF + # Add some statistics. + if ($stats) { + print "line from O.sw + 0,(yscale * ($stat_avg - $ylower)) " . + "to O.se + 0,(yscale * ($stat_avg - $ylower))\n"; + print "\"average\" at last line.e + .2,0 ljust\n"; + print "line from O.sw + 0,(yscale * ($stat_median - $ylower)) " . + "to O.se + 0,(yscale * ($stat_median - $ylower))\n"; + print "\"median\" at last line.e + .2,0 ljust\n"; + $tmp = $stat_median + $stat_avgdev; + print "line from O.sw + 0,(yscale * ($tmp - $ylower)) " . + "to O.se + 0,(yscale * ($tmp - $ylower))\n"; + print "\"+ avgdev\" at last line.e + .2,0 ljust\n"; + $tmp = $stat_median - $stat_avgdev; + print "line from O.sw + 0,(yscale * ($tmp - $ylower)) " . 
+ "to O.se + 0,(yscale * ($tmp - $ylower))\n"; + print "\"- avgdev\" at last line.e + .2,0 ljust\n"; + } +} + +sub data +{ + local($mark) = int(int($_[0]) % int($nmarks)); + + print "\n# DATASET: $dataset, MARK $mark\n"; + $first = 1; + foreach $d (@data) { + next if $d =~ /^\s*"/; + next if $d =~ /^\s*#/; + next if $d =~ /^\s*$/; + @_ = split(/[ \t\n]+/, $d); + $x = sprintf("%.6g", $_[0]); + $y = sprintf("%.6g", $_[1]); + if ($#_ == 1) { + if ($hist) { + print "box fill .25 " . + "ht yscale * ($y - ylower) " . + "wid $hist_bsize * xscale " . + "with .sw at O.sw + " . + "xscale * ($x - xlower),0\n"; + } elsif ($nomarks && ($grapheach || !$first)) { + print $nomark . " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . + "yscale * ($y - ylower))\n"; + } else { + print $marks[$mark] . + " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . + "yscale * ($y - ylower))\n"; + } + if (!$hist && $first != 1) { + print "line $lineinvis thick thk from " . + "2nd last [].c to last [].c\n"; + } + $first = 0; + } elsif ($#_ == 5) { # Quartile graph + # Draw the lower line + print "x = xscale * ($_[0] - xlower)\n"; + print " line thick qthk from \\\n\t" . + "O.sw + x, yscale * ($_[1] - ylower) to\\\n\t" . + "O.sw + x, yscale * ($_[2] - ylower)\n"; + # Draw the mark + print " $marks[$mark]" . " at O.sw + \\\n\t" . + "x, yscale * ($_[3] - ylower)\n"; + # Draw the upper line + print " line thick qthk from \\\n\t" . + "O.sw + x, yscale * ($_[4] - ylower) to\\\n\t" . + "O.sw + x, yscale * ($_[5] - ylower)\n"; + # Connect the lines? + if ($qline) { + if ($first != 1) { + print "line thick thk from " . + "2nd last [].c to last [].c\n"; + } + } + $first = 0; + } + } + # Put a mark on the end point + if ($nomarks && !$nodatal && !$first && !$grapheach) { + print $marks[$mark] . + " at O.sw + \\\n\t" . + "(xscale * ($x - xlower), " . 
+ "yscale * ($y - ylower))\n"; + } + @data = (); +} diff --git a/performance/lmbench3/scripts/html-list b/performance/lmbench3/scripts/html-list new file mode 100755 index 0000000..b91572d --- /dev/null +++ b/performance/lmbench3/scripts/html-list @@ -0,0 +1,123 @@ + +# Take the list of files and turn them into an html file that points +# at their context & mem latency GIFs. +# +# Usage: html-list file file file.... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1995 Larry McVoy. GPLed software. +# $Id: html-list 1.3 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +open(H, ">HTML/specific.html"); +print H <<EOF; +<title>LMBENCH System Results</title> +<h1>LMBENCH System Results</h1> +<h2><a href=summary>Summary of results</a></h2> +<hr> +EOF + +# The order that is passed in is the order of the generated +# graphs so save that. +$val = 0; +foreach $file (@ARGV) { + $number{$file} = ++$val; +} + +# Now sort them so we can group by OS +@ARGV = sort(@ARGV); + +# Figure out the different OS +foreach $file (@ARGV) { + ($os = $file) =~ s|/.*||; + push(@os, $os); + $done{$os} = 0; +} + +foreach $os (@os) { + next if $done{$os}; + $done{$os} = 1; + # Print out an OS specific heading + print H "<hr><h2>Results from $os</h2><p>\n"; + + for ($i = 0; $i <= $#os; $i++) { + $file = $ARGV[$i]; + next unless $file =~ /$os/; + open(F, $file); + $_ = <F>; + close(F); + next unless /lmbench1.[01]/; + chop; + $title = $_; + #s/.lmbench1.? results for //; + ($sys = $file) =~ s|.*/||; + if ($i > 0) { + ($prev_sys = $ARGV[$i - 1]) =~ s|.*/||; + } + if ($i < $#os) { + ($next_sys = $ARGV[$i + 1]) =~ s|.*/||; + } + print H <<EOF; +<h3>Dataset: $sys</h3> +<h4>$title</h4> +<a href="${sys}-ctx.html">Context switch details</a>, +<a href="${sys}-bwmem.html">memory bandwidths</a>, +<a href="${sys}-bwfile.html">file reread vs. memory bandwidths</a>, +and +<a href="${sys}-mem.html">memory latencies</a>. 
+EOF + + # Create the files referencing the data GIFs + $N = sprintf("%02d", $number{$file}); + $prev = $next = ""; + %label = ('ctx', 'context switching', + 'mem', 'memory latency', + 'bwmem', 'memory bandwidth', + 'bwfile', 'file reread bandwidth'); + %doc = ('ctx', 'lat_ctx.8.html', + 'mem', 'lat_mem_rd.8.html', + 'bwmem', 'bw_mem.8.html', + 'bwfile', 'bw_file_rd.8.html'); + $back = "<img align=middle src=\"../gifs/arrows/back.gif\">"; + $forward = "<img align=middle src=\"../gifs/arrows/forward.gif\">"; + for $what ('ctx', 'mem', 'bwmem', 'bwfile') { + for $scale ('', '-unscaled') { + open(S, ">HTML/${sys}-${what}${scale}.html"); + if ($scale eq '') { + $notscale = "-unscaled"; + $lab = ""; + $Lab = "Unscaled "; + } else { + $notscale = ""; + $lab = "scaled "; + $Lab = "Scaled "; + } + $prev = + "<a href=${prev_sys}-${what}${scale}.html> + Previous ${lab}$label{$what} result</a><p>" + if $i > 0; + $next = + "<a href=${next_sys}-${what}.html> + Next ${lab}$label{$what} result</a><p>" + if $i < $#os; + print S<<EOF; +<h4>$title</h4> +<a href=../$doc{$what}>Information on this benchmark</a> (Not up to date) +<p><IMG SRC="${what}${scale}$N.gif">\n<p> +<a href=../lmbench.html> +<img align=middle src="../gifs/arrows/b_arrow.gif">LMBENCH table of contents</a> +<a href=specific.html> +<img align=middle src=\"../gifs/graph.gif\">System results table of contents</a> +<p> +$next +$prev +<a href=${sys}-${what}${notscale}.html> +${Lab}$label{$what} results for this system</a> +EOF + } + } + + } +} +exit 0; diff --git a/performance/lmbench3/scripts/html-man b/performance/lmbench3/scripts/html-man new file mode 100755 index 0000000..8324a30 --- /dev/null +++ b/performance/lmbench3/scripts/html-man @@ -0,0 +1,83 @@ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Take a man tree and make an html tree out of it +# +# Derived from Donners man2html script + +from=/usr/man +to=/u/eo/repository/system/unix/man + +function disambiguate +{ +newbase=${1} +newname="${newbase}.1" +dis=2 
+while [ -a "${newname}" ] + do + newname=$newbase"."$dis + dis=$(expr $dis + 1) + done +} + +while ($ARGV[0] =~ /^-/) { + if ($ARGV[0] eq "-f") { + shift(@ARGV); + $from = shift(@ARGV); + } + if ($ARGV[0] eq "-t") { + shift(@ARGV); + $to = shift(@ARGV); + } +} + +open(FD, "find $from -name '*.[0-9ln]' -print |"); +while ($find = <FD>) { +} + +if [ ! "${indexonly}" ] + then + print "Processing the man pages ..." + for i in man${sections}/* + do + if [ "$verbose" ] + then + print $i + fi + # n=${i%.*} + name=${to}/${i} + if [ -a "${name}" ] + then + oldname=$name + disambiguate $name + name=$newname + print "Collision - ${oldname} will be stored as ${name}" + fi + eqn $i | tbl | nroff -man | rman -f HTML | sed -e "s/MS_LOCAL_HOST/${localeo}/g" > ${name} + done + fi + +print "Building the index.html files ..." +cd $to +for i in man${sections} + do + if [ "$verbose" ] + then + print $i + fi + cd $i + rm -f index.html + echo '<ul>' > ../new.html + for j in * + do + if [ "$verbose" ] + then + print -n "$j " + fi + print + print "<li> <a href=$j>$j</a>" >> ../new.html + done + echo '</ul>' >> ../new.html + mv ../new.html index.html + cd .. + done diff --git a/performance/lmbench3/scripts/info b/performance/lmbench3/scripts/info new file mode 100755 index 0000000..e6860ed --- /dev/null +++ b/performance/lmbench3/scripts/info @@ -0,0 +1,7 @@ +#!/bin/sh + +UNAME=`uname -n 2>/dev/null` +if [ X$UNAME = X ] +then echo INFO +else echo INFO.$UNAME +fi diff --git a/performance/lmbench3/scripts/info-template b/performance/lmbench3/scripts/info-template new file mode 100755 index 0000000..91daa8f --- /dev/null +++ b/performance/lmbench3/scripts/info-template @@ -0,0 +1,42 @@ +Thanks very much for filling this out. The system will save it across +runs so that you don't have to do it again unless you change what you +are measuring (i.e., add disks to the mix). The stuff you fill in is +in lower case, the uppercase stuff you should leave as is. 
+ +If you used "vi" and you don't know how to use it, just type ZZ and skip +this step. + +VENDOR: i.e. SGI, Compaq, Sun, etc. For PC clones, just say clone. + +MOTHERBOARD: this mostly for PC's - it's very important to know there. + +MODEL: SGI O200, Sun Ultra2, Compaq Pressario, Gateway 10,000,000 + +YEAR BOUGHT: 1982 + +PRICE: $10,000 + +PROCESSORS: + NUMBER: 2 + TYPE: 200 Mhz Pentium Pro + +MEMORY: + AMOUNT: 32M, etc. + SPEED: i.e, 60ns, 70ns, etc. + TYPE: FPM, EDO, DIMM, etc + +CACHE: + ONCHIP DCACHE: 32K, set associative (2 or 4 way, can't remember) + ONCHIP ICACHE: 32K, set associative (2 or 4 way, can't remember) + LEVEL 2: 1MB, 2 way set associative, unified + +NETWORK: + ETHERNET: 100baseT, DEC Tulip chip, SMC PCI card + HIPPI: 100MB/sec, 64bit PCI, SGI onboard R4K processors, full duplex + +DISKS: + /dev/sda 4GB Quantum, model 1234 + +MISC: + Anything else that you think is interesting for people + to know about your system. diff --git a/performance/lmbench3/scripts/lmbench b/performance/lmbench3/scripts/lmbench new file mode 100755 index 0000000..53ea511 --- /dev/null +++ b/performance/lmbench3/scripts/lmbench @@ -0,0 +1,483 @@ +#!/bin/sh + +# lmbench - run the lmbench benchmark suite. +# +# Hacked by Larry McVoy (lm@xxxxxxx, lm@xxxxxxx, lm@xxxxxxxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id$ + +# Make sure we can find: ./cmd, df, and netstat +PATH=.:../../scripts:$PATH:/etc:/usr/etc:/sbin:/usr/sbin +export PATH + +if [ -f $1 ] +then . $1 + echo Using config in $1 >> ${OUTPUT} +else echo Using defaults >> ${OUTPUT} + ENOUGH=1000000 + TIMING_O=0 + LOOP_O=0 + LINE_SIZE=512 +fi +export ENOUGH TIMING_O LOOP_O SYNC_MAX LINE_SIZE LMBENCH_SCHED + +if [ X$FILE = X ] +then FILE=/tmp/XXX + touch $FILE || echo Can not create $FILE >> ${OUTPUT} +fi +if [ X$MB = X ] +then MB=8 +fi +AVAILKB=`expr $MB \* 1024` + +# Figure out how big we can go for stuff that wants to use +# all and half of memory. 
+HALF="512 1k 2k 4k 8k 16k 32k 64k 128k 256k 512k 1m" +ALL="$HALF 2m" +i=4 +while [ $i -le $MB ] +do + ALL="$ALL ${i}m" + h=`expr $i / 2` + HALF="$HALF ${h}m" + i=`expr $i \* 2` +done + + +if [ X$FSDIR = X ] +then FSDIR=/usr/tmp/lat_fs +fi +MP=N +if [ $SYNC_MAX -gt 1 ] +then if [ "X$DISKS" != X ] + then echo "MP and disks are mutually exclusive (sorry)" + exit 1 + fi + if [ "X$REMOTE" != X ] + then echo "MP and remote networking are mutually exclusive (sorry)" + exit 1 + fi + MP=Y +fi + +# Figure out as much stuff as we can about this system. +# Sure would be nice if everyone had SGI's "hinv". +echo \[lmbench3.0 results for `uname -a`] 1>&2 +echo \[LMBENCH_VER: <version>] 1>&2 +echo \[BENCHMARK_HARDWARE: ${BENCHMARK_HARDWARE}] 1>&2 +echo \[BENCHMARK_OS: ${BENCHMARK_OS}] 1>&2 +echo \[ALL: ${ALL}] 1>&2 +echo \[DISKS: ${DISKS}] 1>&2 +echo \[DISK_DESC: ${DISK_DESC}] 1>&2 +echo \[ENOUGH: ${ENOUGH}] 1>&2 +echo \[FAST: ${FAST}] 1>&2 +echo \[FASTMEM: ${FASTMEM}] 1>&2 +echo \[FILE: ${FILE}] 1>&2 +echo \[FSDIR: ${FSDIR}] 1>&2 +echo \[HALF: ${HALF}] 1>&2 +echo \[INFO: ${INFO}] 1>&2 +echo \[LINE_SIZE: ${LINE_SIZE}] 1>&2 +echo \[LOOP_O: ${LOOP_O}] 1>&2 +echo \[MB: ${MB}] 1>&2 +echo \[MHZ: ${MHZ}] 1>&2 +echo \[MOTHERBOARD: ${MOTHERBOARD}] 1>&2 +echo \[NETWORKS: ${NETWORKS}] 1>&2 +echo \[PROCESSORS: ${PROCESSORS}] 1>&2 +echo \[REMOTE: ${REMOTE}] 1>&2 +echo \[SLOWFS: ${SLOWFS}] 1>&2 +echo \[OS: ${OS}] 1>&2 +echo \[SYNC_MAX: ${SYNC_MAX}] 1>&2 +echo \[LMBENCH_SCHED: $LMBENCH_SCHED] 1>&2 +echo \[TIMING_O: ${TIMING_O}] 1>&2 +echo \[LMBENCH VERSION: ${VERSION}] 1>&2 +echo \[USER: $USER] 1>&2 +echo \[HOSTNAME: `hostname`] 1>&2 +echo \[NODENAME: `uname -n`] 1>&2 +echo \[SYSNAME: `uname -s`] 1>&2 +echo \[PROCESSOR: `uname -p`] 1>&2 +echo \[MACHINE: `uname -m`] 1>&2 +echo \[RELEASE: `uname -r`] 1>&2 +echo \[VERSION: `uname -v`] 1>&2 + +echo \[`date`] 1>&2 +echo \[`uptime`] 1>&2 +netstat -i | while read i +do echo \[net: "$i"] 1>&2 + set `echo $i` + case $1 in + *ame) ;; + *) ifconfig $1 | 
while read i + do echo \[if: "$i"] 1>&2 + done + ;; + esac +done + +mount | while read i +do echo \[mount: "$i"] 1>&2 +done + +STAT=$FSDIR/lmbench +mkdir $FSDIR 2>/dev/null +touch $STAT 2>/dev/null +if [ ! -f $STAT ] +then echo "Can't make a file - $STAT - in $FSDIR" >> ${OUTPUT} + touch $STAT + exit 1 +fi +if [ X$SYNC != X ] +then /bin/rm -rf $SYNC + mkdir -p $SYNC 2>/dev/null + if [ ! -d $SYNC ] + then echo "Can't make $SYNC" >> ${OUTPUT} + exit 1 + fi +fi + +date >> ${OUTPUT} +echo Latency measurements >> ${OUTPUT} +msleep 250 +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SYSCALL = XYES ]; then + lat_syscall -P $SYNC_MAX null + lat_syscall -P $SYNC_MAX read + lat_syscall -P $SYNC_MAX write + lat_syscall -P $SYNC_MAX stat $STAT + lat_syscall -P $SYNC_MAX fstat $STAT + lat_syscall -P $SYNC_MAX open $STAT +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SELECT = XYES ]; then + for i in 10 100 250 500 + do lat_select -n $i -P $SYNC_MAX file + done + for i in 10 100 250 500 + do lat_select -n $i -P $SYNC_MAX tcp + done +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_SIG = XYES ]; then + lat_sig -P $SYNC_MAX install + lat_sig -P $SYNC_MAX catch + lat_sig -P $SYNC_MAX prot lat_sig +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PIPE = XYES ]; then + lat_pipe -P $SYNC_MAX +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UNIX = XYES ]; then + lat_unix -P $SYNC_MAX +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PROC = XYES ]; then + cp hello /tmp/hello + for i in fork exec shell + do lat_proc -P $SYNC_MAX $i + done + rm -f /tmp/hello +fi +if [ X$BENCHMARK_HARDWARE = XYES -o X$BENCHMARK_OPS = XYES ]; then + lat_ops + par_ops +fi + +rm -f $FILE + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + # choose one sample bandwidth from the middle of the pack + sample=`expr $SYNC_MAX / 2` + i=0 + while [ $i -lt $SYNC_MAX ]; do + if [ $i -eq $sample ]; then + lmdd label="File $FILE write bandwidth: " \ + of=$FILE move=${MB}m fsync=1 print=3 & + else + lmdd label="File $FILE 
write bandwidth: " \ + of=$FILE.$i move=${MB}m fsync=1 print=3 \ + >/dev/null 2>&1 & + fi + i=`expr $i + 1` + done + wait + rm -f $FILE.* +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PAGEFAULT = XYES ]; then + lat_pagefault -P $SYNC_MAX $FILE +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_MMAP = XYES ]; then + echo "" 1>&2 + echo \"mappings 1>&2 + for i in $ALL + do lat_mmap -P $SYNC_MAX $i $FILE + done + echo "" 1>&2 +fi +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + if [ X$SLOWFS != XYES ] + then date >> ${OUTPUT} + echo Calculating file system latency >> ${OUTPUT} + msleep 250 + echo '"File system latency' 1>&2 + lat_fs $FSDIR + echo "" 1>&2 + fi +fi + +if [ X$BENCHMARK_HARDWARE = XYES ]; then + if [ X"$DISKS" != X ] + then for i in $DISKS + do if [ -r $i ] + then echo "Calculating disk zone bw & seek times" \ + >> ${OUTPUT} + msleep 250 + disk $i + echo "" 1>&2 + fi + done + fi +fi + +date >> ${OUTPUT} +echo Local networking >> ${OUTPUT} +if [ ! -d ../../src/webpage-lm ] +then (cd ../../src && tar xf webpage-lm.tar) + sync + sleep 1 +fi +SERVERS="lat_udp lat_tcp lat_rpc lat_connect bw_tcp" +for server in $SERVERS; do $server -s; done +DOCROOT=../../src/webpage-lm lmhttp 8008 & +sleep 2; + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UDP = XYES ]; then + lat_udp -P $SYNC_MAX localhost +fi +lat_udp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + lat_tcp -P $SYNC_MAX localhost +fi +lat_tcp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_RPC = XYES ]; then + lat_rpc -P $SYNC_MAX -p udp localhost + lat_rpc -P $SYNC_MAX -p tcp localhost +fi +lat_rpc -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CONNECT = XYES ]; then + if [ $SYNC_MAX = 1 ]; then lat_connect localhost; fi +fi +lat_connect -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + echo "" 1>&2 + echo "Socket bandwidth using localhost" 1>&2 + for m in 1 64 128 256 512 1024 1437 10M; do + bw_tcp -P $SYNC_MAX -m 
$m localhost; + done + echo "" 1>&2 +fi +bw_tcp -S localhost + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HTTP = XYES ]; then + # I want a hot cache number + lat_http localhost 8008 < ../../src/webpage-lm/URLS > /dev/null 2>&1 + lat_http localhost 8008 < ../../src/webpage-lm/URLS +fi +lat_http -S localhost 8008 + +for remote in $REMOTE +do + echo Networking to $remote >> ${OUTPUT} + $RCP $SERVERS lmhttp ../../src/webpage-lm.tar ${remote}:/tmp + for server in $SERVERS + do $RSH $remote -n /tmp/$server -s & + done + $RSH $remote -n 'cd /tmp; tar xf webpage-lm.tar; cd webpage-lm; ../lmhttp 8008' & + sleep 10 + echo "[ Networking remote to $remote: `$RSH $remote uname -a` ]" 1>&2 + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UDP = XYES ]; then + lat_udp -P $SYNC_MAX $remote; + fi + lat_udp -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + lat_tcp -P $SYNC_MAX $remote; + fi + lat_tcp -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_RPC = XYES ]; then + lat_rpc -P $SYNC_MAX -p udp $remote; + lat_rpc -P $SYNC_MAX -p tcp $remote; + fi + lat_rpc -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CONNECT = XYES ]; then + if [ $SYNC_MAX = 1 ]; then lat_connect $remote; fi + fi + lat_connect -S $remote; + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_TCP = XYES ]; then + echo "Socket bandwidth using $remote" 1>&2 + for m in 1 64 128 256 512 1024 1437 10M; do + bw_tcp -P $SYNC_MAX -m $m $remote; + done + echo "" 1>&2 + fi + bw_tcp -S $remote + + if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HTTP = XYES ]; then + # I want a hot cache number + lat_http $remote 8008 < ../../src/webpage-lm/URLS > /dev/null 2>&1 + lat_http $remote 8008 < ../../src/webpage-lm/URLS + fi + lat_http -S $remote 8008 + + RM= + for server in $SERVERS + do RM="/tmp/$server $RM" + done + $RSH $remote rm $RM +done + +date >> ${OUTPUT} +echo Bandwidth measurements >> ${OUTPUT} +msleep 250 + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_UNIX = XYES ]; then + bw_unix -P 
$SYNC_MAX +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_PIPE = XYES ]; then + bw_pipe -P $SYNC_MAX +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_FILE = XYES ]; then + echo "" 1>&2 + echo \"read bandwidth 1>&2 + for i in $ALL + do bw_file_rd -P $SYNC_MAX $i io_only $FILE + done + echo "" 1>&2 + + echo \"read open2close bandwidth 1>&2 + for i in $ALL + do bw_file_rd -P $SYNC_MAX $i open2close $FILE + done + echo "" 1>&2 +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_MMAP = XYES ]; then + echo "" 1>&2 + echo \"Mmap read bandwidth 1>&2 + for i in $ALL + do bw_mmap_rd -P $SYNC_MAX $i mmap_only $FILE + done + echo "" 1>&2 + + echo \"Mmap read open2close bandwidth 1>&2 + for i in $ALL + do bw_mmap_rd -P $SYNC_MAX $i open2close $FILE + done + echo "" 1>&2 + rm -f $FILE +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_HARDWARE = XYES \ + -o X$BENCHMARK_BCOPY = XYES ]; then + echo "" 1>&2 + echo \"libc bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i bcopy; done; echo "" 1>&2 + + echo \"libc bcopy aligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i bcopy conflict; done; echo "" 1>&2 + + echo "Memory bzero bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i bzero; done; echo "" 1>&2 + + echo \"unrolled bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i fcp; done; echo "" 1>&2 + + echo \"unrolled partial bcopy unaligned 1>&2 + for i in $HALF; do bw_mem -P $SYNC_MAX $i cp; done; echo "" 1>&2 + + echo "Memory read bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i frd; done; echo "" 1>&2 + + echo "Memory partial read bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i rd; done; echo "" 1>&2 + + echo "Memory write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i fwr; done; echo "" 1>&2 + + echo "Memory partial write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i wr; done; echo "" 1>&2 + + echo "Memory partial read/write bandwidth" 1>&2 + for i in $ALL; do bw_mem -P $SYNC_MAX $i rdwr; done; echo 
"" 1>&2 +fi + +if [ X$BENCHMARK_OS = XYES -o X$BENCHMARK_CTX = XYES ]; then + date >> ${OUTPUT} + echo Calculating context switch overhead >> ${OUTPUT} + msleep 250 + if [ $MB -ge 8 ] + then CTX="0 4 8 16 32 64" + N="2 4 8 16 24 32 64 96" + else + CTX="0 4 8 16 32" + N="2 4 8 16 24 32 64 96" + fi + + echo "" 1>&2 + for size in $CTX + do + lat_ctx -P $SYNC_MAX -s $size $N + done + echo "" 1>&2 +fi + +if [ X$BENCHMARK_HARDWARE = XYES -o X$BENCHMARK_MEM = XYES ]; then + if [ $SYNC_MAX = 1 ]; then + date >> ${OUTPUT} + echo Calculating effective TLB size >> ${OUTPUT} + msleep 250 + tlb -L $LINE_SIZE -M ${MB}M + echo "" 1>&2 + + date >> ${OUTPUT} + echo Calculating memory load parallelism >> ${OUTPUT} + msleep 250 + echo "Memory load parallelism" 1>&2 + par_mem -L $LINE_SIZE -M ${MB}M + echo "" 1>&2 + +# date >> ${OUTPUT} +# echo Calculating cache parameters >> ${OUTPUT} +# msleep 250 +# cache -L $LINE_SIZE -M ${MB}M + fi + + date >> ${OUTPUT} + echo McCalpin\'s STREAM benchmark >> ${OUTPUT} + msleep 250 + stream -P $SYNC_MAX -M ${MB}M + stream -P $SYNC_MAX -v 2 -M ${MB}M + + date >> ${OUTPUT} + echo Calculating memory load latency >> ${OUTPUT} + msleep 250 + echo "" 1>&2 + echo "Memory load latency" 1>&2 + if [ X$FASTMEM = XYES ] + then lat_mem_rd -P $SYNC_MAX $MB 128 + else lat_mem_rd -P $SYNC_MAX $MB 16 32 64 128 256 512 1024 + fi + echo "" 1>&2 + echo "Random load latency" 1>&2 + lat_mem_rd -t -P $SYNC_MAX $MB 16 + echo "" 1>&2 +fi + +date >> ${OUTPUT} +echo '' 1>&2 +echo \[`date`] 1>&2 + +exit 0 diff --git a/performance/lmbench3/scripts/make b/performance/lmbench3/scripts/make new file mode 100755 index 0000000..59bf238 --- /dev/null +++ b/performance/lmbench3/scripts/make @@ -0,0 +1,20 @@ +#!/bin/sh + +if [ "X$MAKE" != "X" ] && echo "$MAKE" | grep -q '`' +then + MAKE= +fi + +if [ X$MAKE = X ] +then MAKE=make + for p in `echo $PATH | sed 's/:/ /g'` + do if [ -f $p/gmake ] + then + if $p/gmake testmake > /dev/null 2>&1 + then + MAKE=$p/gmake + fi + fi + done +fi 
+echo $MAKE diff --git a/performance/lmbench3/scripts/man2html b/performance/lmbench3/scripts/man2html new file mode 100755 index 0000000..742f69f --- /dev/null +++ b/performance/lmbench3/scripts/man2html @@ -0,0 +1,254 @@ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Usage $0 manpage +# Parse my man page formats. + +die "Usage: $0 [ manpage ] \n" unless $#ARGV <= 0; + +$firstSH = 1; +$inDL = 0; + +warn "Doing $ARGV[0]\n"; + +open(STDIN, "$ARGV[0]") if ($#ARGV == 0); + +while (<>) { + next if (/^\.\\"/); + + if (/^\.TH\s/) { + # .TH BW_MEM_CP 8 "$Date: 00/01/31 15:29:42-08:00 $" "(c)1994 Larry McVoy" "LMBENCH" + split; + print "<TITLE>$_[1]($_[2]) - LMBENCH man page</TITLE>\n"; + print "<H2>$_[1]($_[2]) - LMBENCH man page</H2><HR>\n"; + next; + } + + if (/^\.SH\s/) { + s/.SH\s+//; + s/"//g; + chop; + print "</DL>\n" unless $firstSH; $firstSH = 0; + print "</DL>\n" if $inDL; $inDL = 0; + print "<DL><DT><H4>$_</H4><DD>\n"; + next; + } + + next if &fontfont; + + if (/^\.LP\s/ || /^\.PP/) { + s/..P\s+//; + chop; + print "<P>\n"; + next; + } + + if (/^\.TP/) { # treat as a DT list + $_ = <>; + &html; + chop; + print "</DL>\n" if ($inDL); + print "<DL><DT>"; + print unless &fontfont; + print "<DD><BR>\n"; + $inDL = 1; + next; + } + + if (/^\.IP/) { # treat as a DT list + s/^\.IP\s*//; + chop; + s/"//; + s/".*//; + &html; + print "</DL>\n" if ($inDL); + print "<DL><DT>$_<DD><BR>\n"; + $inDL = 1; + next; + } + + if (/^\.sp/) { + print "<PRE>\n</PRE>\n"; + next; + } + + next if (/^\.in/ || /^\.ps/); # skip this stuff. 
+ + if (/^\.br/) { + print "<BR>\n"; + next; + } + + if (/^\.nf/ || /^\.DS/) { # starting a display + print "<PRE>\n"; + while (<>) { + last if /^\.fi/; + last if /^\.DE/; + next if /^\./; + &html; + print "\t$_"; # XXX - a screwy way of indenting + } + print "</PRE>\n"; + next; + } + + if (/^\.ft C[WB]/) { + local($pre) = 0; + + print "<CODE>\n"; + while (<>) { + last if /^\.ft\s*$/; + if (/^\.nf/) { + $pre = 1; + print "<PRE>\n"; + next; + } + if ($pre && /^\.fi/) { + print "</PRE>\n"; + $pre = 0; + next; + } + next if /^\.br/; + &html; + print; + } + print "</CODE>\n"; + next; + } + + if (/\\f\(C[WB]/) { + &html; + s/\\f\(C[WB]/<CODE>/; + while (!/\\f/) { + &html; + print; + $_ = <>; + } + s/\\fP/<\/CODE>/; + print; + next; + } + + if (/\\fB/) { + &html; + s/\\fB/<STRONG>/; + while (!/\\f/) { + print; + $_ = <>; + &html; + } + s/\\fP/<\/STRONG>/; + print; + next; + } + + if (/\\fI/) { + &html; + s/\\fB/<EM>/; + while (!/\\f/) { + print; + $_ = <>; + &html; + } + s/\\fP/<\/EM>/; + print; + next; + } + + if (/^\.ti/) { # one line display + print "<PRE>\n"; + $_ = <>; + &html; + print; + print "</PRE>\n"; + next; + } + + if (/^\.de\s+/) { + s/^\.de\s+//; + warn "$ARGV[0]: Ignoring definition: $_"; + while (<>) { + last if /^\.\./; + } + next; + } + + # Warn about unimplemented troff/man commands + if (/^\./) { + chop; + warn "$ARGV[0] unimp: \"$_\"\n"; + next; + } + + if (/\\f/) { + warn "$ARGV[0]: missed font: \"$_\"\n"; + } + + # Catchall for all the weirdball things I do. + s/^\\\&\.\\\|\.\\\|\./.../; + s/\\-/-/; + + &html; + + print; +} +exit 0; + +sub html +{ + # HTML things that I've encountered. 
+ s/"/"/g; + s/</</g; + s/>/>/g; +} + +sub fontfont { + + if (/^\.BI\s/) { + s/.BI\s+//; + chop; + split; + print "<STRONG>$_[0]</STRONG><EM>$_[1]</EM>\n"; + return 1; + } + + if (/^\.IB\s/) { + s/.IB\s+//; + chop; + split; + print "<EM>$_[0]</EM><STRONG>$_[1]</STRONG>\n"; + return 1; + } + + if (/^\.IR\s/) { + s/.IR\s+//; + chop; + split; + print "<EM>$_[0]</EM>$_[1]\n"; + return 1; + } + + if (/^\.BR\s/) { + s/.BR\s+//; + chop; + split; + print "<STRONG>$_[0]</STRONG>$_[1]\n"; + return 1; + } + + if (/^\.B\s/) { + s/.B\s+//; + chop; + print "<STRONG>$_</STRONG>\n"; + return 1; + } + + if (/^\.I\s/) { + s/.I\s+//; + chop; + print "<EM>$_</EM>\n"; + return 1; + } + + return 0; +} diff --git a/performance/lmbench3/scripts/mkrelease b/performance/lmbench3/scripts/mkrelease new file mode 100755 index 0000000..be50f03 --- /dev/null +++ b/performance/lmbench3/scripts/mkrelease @@ -0,0 +1,23 @@ +#!/bin/sh + +# %W% +# +# XXX - does not check for checked out files. + +make -s clean +make -s get +VERS=`egrep 'MAJOR|MINOR' src/version.h | awk '{print $3}'` +set `echo $VERS` +if [ $2 -lt 0 ] +then VERS=`echo $1$2 | sed s/-/alpha/` +else VERS=`echo $VERS |sed 's/ /./'` +fi +D=lmbench-$VERS +mkdir $D $D/results +cp -rp SCCS doc hbench-REBUTTAL lmbench-HOWTO scripts src $D +cp -rp results/SCCS $D/results +(cd $D && make -s get) +/bin/rm -rf $D/SCCS $D/*/SCCS +tar czvf $D.tgz $D +/bin/rm -rf $D +make -s clean diff --git a/performance/lmbench3/scripts/new2oldctx b/performance/lmbench3/scripts/new2oldctx new file mode 100755 index 0000000..3e2ed48 --- /dev/null +++ b/performance/lmbench3/scripts/new2oldctx @@ -0,0 +1,31 @@ + +# Convert the new format: +# Context switch of 8 4k processes: 64.17 (60.02 overhead) +# to the old format: +#"size=0 ovr=22 +# 2 8 +# 4 14 +# 8 18 +# 16 21 +# 20 22 + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +@lines = grep(/Context switch/, <>); +foreach $size ("0k", "4k", "16k", "32k", "64k") { + @data = grep(/$size/, @lines); + @a = @b = @c = (); + $i = 0; 
+ foreach $n (2, 4, 8, 16, 20) { + @tmp = (); + foreach $_ (grep(/of $n/, @data)) { + @_ = split; + push(@tmp, "$_[3] $_[6]\n"); + } + ($a[$i],$b[$i],$c[$i]) = @tmp; + $i++; + } + print "\n\"size=$size \n"; + print @c; +} diff --git a/performance/lmbench3/scripts/opercent b/performance/lmbench3/scripts/opercent new file mode 100755 index 0000000..8f34c1e --- /dev/null +++ b/performance/lmbench3/scripts/opercent @@ -0,0 +1,92 @@ + +eval "exec perl -sS $0 $*" + if 0; + +$fmt = 0; +@fmts = ( +"%33s %4s %4s %3s %4s %4s %4s %4s %4s %4s\n", +"%28s %6s %6s %5s %6s %7s %7s\n", +"%29s %5s %4s %5s %5s %5s %5s %4s\n", +"%30s %6s %6s %6s %8s %5s %7s\n", +"%28s %4s %4s %6s %6s %6s %6s %4s %5s\n", +"%29s %5s %6s %11s\n", +); +while (<>) { + print; + next unless /^Host/; + $_ = <>; print; + unless (/^-/) { + $_ = <>; print; + } + @values = (); + @a = @b = @c = @d = @e = @f = @g = @h = @i = @j = @k = (); + $i = 0; + while (<>) { + last if /^\s/; + print; + s/.......................\s+//; + ($a[$i],$b[$i],$c[$i],$d[$i],$e[$i],$f[$i],$g[$i],$h[$i],$i[$i],$j[$i],$k[$i]) = split; + $i++; + } + $a = &sss(@a) if $#a != -1; + $b = &sss(@b) if $#b != -1; + $c = &sss(@c) if $#c != -1; + $d = &sss(@d) if $#d != -1; + $e = &sss(@e) if $#e != -1; + $f = &sss(@f) if $#f != -1; + $g = &sss(@g) if $#g != -1; + $h = &sss(@h) if $#h != -1; + $i = &sss(@i) if $#i != -1; + $j = &sss(@j) if $#j != -1; + $k = &sss(@k) if $#k != -1; + printf $fmts[$fmt], $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k; + print "\n"; + exit if $fmt++ == $#fmts; +} + +sub sss +{ + local($tmp); + local(@values) = (); + local($n, $sum, $min, $max) = (0,0,1.7E+300,2.2E-300); + + foreach $_ (@_) { + next unless /^\d/; + chop if /K$/; + push(@values, $_); + $sum += $_; + $min = $_ if $_ < $min; + $max = $_ if $_ > $max; + $n++; + } + return "" if $#values == -1; + # Do some statistics. 
+ @s = sort(@values); + if ($n & 1) { + $median = $s[($n + 1)/2]; + } else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; + } + $avg = $sum/$n; + $avgdev = $var = 0; + foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? $tmp : -$tmp; + } + $var /= $n - 1; + $stddev = sqrt($var); + $avgdev /= $n; + #printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); + #printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; + $percent = $stddev/$median*100; + if ($percent > 90) { + printf "Huh: $percent $stddev $median @values\n"; + } + if ($percent >= 10) { + return sprintf "%.0f%%", $percent; + } else { + return sprintf "%.1f%%", $percent; + } +} diff --git a/performance/lmbench3/scripts/os b/performance/lmbench3/scripts/os new file mode 100755 index 0000000..ea767c6 --- /dev/null +++ b/performance/lmbench3/scripts/os @@ -0,0 +1,20 @@ +#!/bin/sh + +if [ "X$OS" != "X" ] && echo "$OS" | grep -q '`' +then + OS= +fi + +if [ "X$OS" = "X" ] +then OS=bloat-os + MACHINE=`uname -m | sed -e 's/ //g' | sed -e 's?/?-?g'` + SYSTEM=`uname -s | sed -e 's/ //g' | sed -e 's?/?-?g'` + OS="${MACHINE}-${SYSTEM}" + if [ -f ../scripts/gnu-os ] + then OS=`../scripts/gnu-os | sed s/unknown-//` + fi + if [ -f ../../scripts/gnu-os ] + then OS=`../../scripts/gnu-os | sed s/unknown-//` + fi +fi +echo $OS diff --git a/performance/lmbench3/scripts/output b/performance/lmbench3/scripts/output new file mode 100755 index 0000000..2a204e3 --- /dev/null +++ b/performance/lmbench3/scripts/output @@ -0,0 +1,10 @@ +#!/bin/sh +trap "echo /dev/null" 20 +OUTPUT=/dev/null; export OUTPUT +if [ -w /dev/tty ]; then + if echo "" > /dev/tty; then + OUTPUT=/dev/tty; export OUTPUT + fi +fi 2>/dev/null +echo "${OUTPUT}" +exit 0 diff --git a/performance/lmbench3/scripts/percent b/performance/lmbench3/scripts/percent new file mode 100755 index 0000000..9b98cd9 
--- /dev/null +++ b/performance/lmbench3/scripts/percent @@ -0,0 +1,95 @@ + +eval "exec perl -sS $0 $*" + if 0; + +$fmt = 0; +@fmts = ( +"%24s %4s %4s %3s %4s %5s %4s %4s %4s %5s %4s %3s\n", +"%24s %4s %6s %5s %5s %6s %7s %7s\n", +"%24s %4s %5s %4s %5s %5s %5s %5s %4s\n", +"%24s %6s %6s %6s %8s %5s %7s\n", +"%24s %3s %4s %4s %6s %7s %6s %5s %5s %5s\n", +"%24s %5s %5s %5s %12s\n", +); +while (<>) { + print; + next unless /^Host/; + $_ = <>; print; + unless (/^-/) { + $_ = <>; print; + } + @values = (); + @a = @b = @c = @d = @e = @f = @g = @h = @i = @j = @k = (); + $i = 0; + while (<>) { + last if /^\s/; + print; + s/.......................\s+//; + ($a[$i],$b[$i],$c[$i],$d[$i],$e[$i],$f[$i],$g[$i],$h[$i],$i[$i],$j[$i],$k[$i]) = split; + $i++; + } + $a = &sss(@a) if $#a != -1; + $b = &sss(@b) if $#b != -1; + $c = &sss(@c) if $#c != -1; + $d = &sss(@d) if $#d != -1; + $e = &sss(@e) if $#e != -1; + $f = &sss(@f) if $#f != -1; + $g = &sss(@g) if $#g != -1; + $h = &sss(@h) if $#h != -1; + $i = &sss(@i) if $#i != -1; + $j = &sss(@j) if $#j != -1; + $k = &sss(@k) if $#k != -1; + printf $fmts[$fmt], "", $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k; + print "\n"; + exit if $fmt++ == $#fmts; +} + +sub sss +{ + local($i, $tmp); + local(@values) = (); + local($n, $sum, $min, $max) = (0,0,1.7E+300,2.2E-300); + + foreach $_ (@_) { + next unless /^\d/; + chop if /K$/; + push(@values, $_); + $sum += $_; + $min = $_ if $_ < $min; + $max = $_ if $_ > $max; + $n++; + } + return "" if $#values == -1; + # Do some statistics. + @s = sort(@values); + if ($n & 1) { + $median = $s[($n + 1)/2]; + } else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; + } + $avg = $sum/$n; + $avgdev = $var = 0; + foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? 
$tmp : -$tmp; + } + $var /= $n - 1; + $stddev = sqrt($var); + $avgdev /= $n; + #printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); + #printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; + $percent = $stddev/$median*100; + if ($percent > 90) { + printf "Huh: $percent $stddev $median @values\n"; + } + if ($percent < .5) { + return "0 "; + } elsif ($percent < 1) { + $tmp = sprintf "%.1f%%", $percent; + return $tmp; + } else { + return sprintf "%.0f%%", $percent; + } +} diff --git a/performance/lmbench3/scripts/rccs b/performance/lmbench3/scripts/rccs new file mode 100755 index 0000000..def0785 --- /dev/null +++ b/performance/lmbench3/scripts/rccs @@ -0,0 +1,733 @@ + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# Mimic the BSD tool, sccs, for RCS. +# $Id: rccs 1.7 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +# +# Note - this reflects a lot of my personal taste. I'll try and list the +# important differences here: +# +# A bunch of unused commands are not implemented. It is easy to add them, +# mail me if you want me to add something. Please include a spec of what +# you want the command to do. Mail lm@xxxxxxxxxxxx. +# +# I look at RCS file internals and know about certain fields as of revision +# 5.x. +# +# This interface does not require a list of files/directories for most +# commands; the implied list is *,v and/or RCS/*,v. Destructive commands, +# such as clean -f, unedit, unget, do *not* have an implied list. In +# other words, +# rccs diffs is the same as rccs diffs RCS +# but +# rccs unedit is not the same as rccs unedit RCS +# +# If you add (potentially) destructive commands, please check for +# them in main() and make sure that the autoexpand does not happen. +# +# TODO: +# Make it so that you can pass a list of files/dirs via stdin. +# +# It might be nice to have all the "system" args printed out in +# verbose and/or learn mode. 
Depends on whether you want people +# to learn RCS or not. + +&init; +&main; +exit 0; # probably not reached. + +sub init +{ + $0 =~ s|.*/||; + # Add commands here so that -w shuts up. + $lint = 0; + + &clean() && &create() && &example() && &get() && &edit() && + &unedit() && &unget() && &diffs() && &delta() && &help() && + &prs() && &prt() && &deledit() && &delget() && &enter() && + &info() && &ci() && &co() && &fix() && &print() + if $lint; +} + +sub help +{ + if ($#_ == -1) { + &usage; + } + + # Handle all the aliases. + if ($_[0] eq "unedit" || $_[0] eq "unget") { + &help("clean"); + } elsif ($_[0] eq "clean") { + } + warn "Extended help on @_ not available yet.\n"; +} + +sub usage +{ +print <<EOF; + +usage: $0 [$0 opts] command [args] [file and/or directory list] + +$0 options are: + -debug for debugging of $0 itself + -verbose for more information about what $0 is doing + +More information may be had by saying "$0 help subcommand". + +Most commands take "-s" to mean do the work silently. 
+ +Command Effect +------- ------ + clean - remove unedited (ro) working files + -e remove unmodified edited (rw) & unedited (ro) files + -f (force) remove modified working files as well + create - add a set of files to RCS control and get (co) the working files + -g do not do the get (co) of the working files + -y<msg> use <msg> as the description message (aka -d<msg>) + delta - check in a revision + -y<msg> use <msg> as the log message (aka -d<msg>) + -s + diffs - diff the working file against the RCS file + fix - redit the last revision + get - get the working file[s] (possibly for editing) + history - print history of the files + print - print the history and the latest contents + +Alias Real command Effect +----- ------------ ------ + ci - delta check in a revision + co - get check out a revision + enter - create -g initialize a file without a get afterward + unedit - clean -f remove working file even if modified + unget - clean -f remove working file even if modified + edit - get -e check out the file for editing + prs - history print change log history + prt - history print change log history + +An implied list of *,v and/or RCS/*,v is implied for most commands. +The exceptions are commands that are potentially destructive, such as +unedit. + +EOF + + exit 0; +} + +sub main +{ + local($cmd); + local(@args); + local(@comma_v); + + $cmd = "oops"; + $cmd = shift(@ARGV) if $#ARGV > -1; + &help(@ARGV) if $cmd eq "help" || $cmd eq "oops"; + + $dir_specified = $file_specified = 0; + foreach $_ (@ARGV) { + # If it is an option, just pass it through. + if (/^-/) { + push(@args, $_); + } + # If they specified an RCS directory, explode it into ,v files. + elsif (-d $_) { + $dir_specified = 1; + warn "Exploding $_\n" if $debug; + push(@args, grep(/,v$/, &filelist($_))); + push(@args, grep(/,v$/, &filelist("$_/RCS"))); + } + # If it is a file, make it be the ,v file. + else { + if (!/,v$/) { + # XXX - what if both ./xxx,v and ./RCS/xxx,v? 
+ if (-f "$_,v") { + $_ .= ",v"; + } else { + if (m|/|) { + m|(.*)/(.*)|; + $f = "$1/RCS/$2,v"; + } else { + $f = "RCS/$_,v"; + } + if (-f $f) { + $_ = $f; + } + } + } + if (-f $_) { + $file_specified = 1; + warn "Adding $_\n" if $debug; + push(@args, $_); + } else { + warn "$0: skipping $_, no RCS file.\n"; + } + } + } + + # Figure out if it is a potentially destructive command. These + # commands do not automagically expand *,v and RCS/*,v. + $destructive = ($cmd eq "clean" && $args[0] eq "-f") || + $cmd eq "unedit" || $cmd eq "unget"; + + # If they didn't specify a file or a directory, generate a list + # of all ./*,v and ./RCS/*,v files. + unless ($destructive || $dir_specified || $file_specified) { + warn "Exploding . && ./RCS\n" if $debug; + push(@args, grep(/,v$/, &filelist("."))); + push(@args, grep(/,v$/, &filelist("RCS"))); + } + + unless ($cmd =~ /^create$/) { + @comma_v = grep(/,v$/, @args); + if ($#comma_v == -1) { + ($s = "$cmd @ARGV") =~ s/\s+$//; + die "$0 $s: No RCS files specified.\n"; + } + } + + # Exit codes: + # 0 - it worked + # 1 - unspecified error + # 2 - command unknown + $exit = 2; + warn "Trying &$cmd(@args)\n" if $debug; + eval(&$cmd(@args)); + + if ($exit == 2) { + warn "Possible unknown/unimplemented command: $cmd\n"; + &usage; + } else { + exit $exit; + } +} + +# Read the directory and return a list of files. +# XXX - isn't there a builtin that does this? +sub filelist +{ + local(@entries) = (); + local($ent); + + opendir(DFD, $_[0]) || return (); + foreach $ent (readdir(DFD)) { + $ent = "$_[0]/$ent"; + next unless -f $ent; + push(@entries, $ent); + } + warn "filelist($_[0]): @entries\n" if $debug; + @entries; +} + +# Take a list of ,v files and return a list of associated working files. +sub working +{ + local(@working, $working) = (); + + foreach $comma_v (@_) { + # Strip the ,v. + # Strip the RCS specification. 
+ ($working = $comma_v) =~ s|,v$||; + $working =~ s|RCS/||; + push(@working, $working); + } + @working; +} + +# Same as "clean -f" - throw away all changes +sub unedit { &clean("-f", @_); } +sub unget { &clean("-f", @_); } + +# Get rid of everything that isn't edited and has an associated RCS file. +# -e remove edited files that have not been changed. +# -f remove files that are edited with changes (CAREFUL!) +# This implies the -e opt. +# -d<m> Check in files that have been modified. If no message, prompt +# on each file. This implies -e. +# -y<m> Like -d for people that are used to SCCS. +# -m<m> Like -d for people that are used to RCS. +# +# Note: this does not use rcsclean; I don't know when that showed up. And +# the 5.x release of RCS I have does not install it. +sub clean +{ + local(@working); + local($e_opt, $f_opt, $d_opt, $s_opt) = (0,0,0,0); + local($msg); + local(@checkins) = (); + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-s") { + $s_opt = 1; + shift(@_); + } elsif ($_[0] eq "-e") { + $e_opt = 1; + shift(@_); + } elsif ($_[0] eq "-f") { + $f_opt = $e_opt = 1; + shift(@_); + } elsif ($_[0] =~ /^-[dym]/) { + $d_opt = $e_opt = 1; + if ($_[0] =~ /^-[dym]$/) { + $msg = $_[0]; + } else { + ($msg = $_[0]) =~ s/-[ydm]//; + $msg = "-m'" . $msg . "'"; + } + shift(@_); + } else { + die "$0 clean: unknown option: $_[0]\n"; + } + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Unlink. + if (!-w $working[$i]) { + warn "rm $working[$i]\n" unless $s_opt; + # Make sure there is an RCS file + if (-f $_[$i]) { + # XXX - what if ro and edited? + unlink($working[$i]) unless $n; + } else { + warn "clean: no RCS file for $working[$i]\n"; + } + next; + } + + # If they just want to know about it, tell them. 
+ if ($e_opt == 0) { + open(RCS, $_[$i]); + while (defined($r = <RCS>)) { + last if $r =~ /locks/; + } + @locks = (); + while (defined($r = <RCS>)) { + # XXX - I use "comment" a delimiter. + last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } else { + warn "$working[$i]: " . + "writeable but not edited?!?\n"; + } + next; + } + + # See if there have actually been any changes. + # Notice that this is cmp(1) in about 10 lines of perl! + open(RCS, "co -q -p -kkvl $_[$i] |"); + open(WORK, $working[$i]); + $diff = 0; + while (defined($r = <RCS>)) { + unless (defined($w = <WORK>) && ($r eq $w)) { + $diff = 1; + last; + } + } + if (defined($w = <WORK>)) { + $diff = 1; + } + close(RCS); close(WORK); + if ($diff) { + if ($f_opt) { + warn "Clean modified $working[$i]\n" + unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } elsif ($d_opt) { + push(@checkins, $_[$i]); + } else { + warn "Can't clean modified $working[$i]\n"; + } + next; + } else { + warn "rm $working[$i]\n" unless $s_opt; + unless ($n) { + unlink($working[$i]); + system "rcs -q -u $_[$i]"; + } + } + } + + # Handle files that needed deltas. + if ($#checkins > -1) { + warn "ci -q $msg @checkins\n" if $verbose; + system "ci -q $msg @checkins"; + } + + $exit = 0; +} + +# Create - initialize the RCS file +# -y<c> - use <c> as the description message for all files. +# -d<c> - use <c> as the description message for all files. +# -g - don't do the get +# +# Differs from sccs in that it does not preserve the original +# files (I never found that very useful). +sub create +{ + local($arg, $noget, $description, $cmd) = ("", "", ""); + + foreach $arg (@_) { + # Options... 
+ if ($arg =~ /^-[yd]/) { + ($description = $arg) =~ s/^-[yd]//; + $arg = ""; + warn "Desc: $description\n" if $debug; + next; + } + if ($arg eq "-g") { + $noget = "yes"; + $arg = ""; + next; + } + next if ($arg =~ /^-/); + + # If no RCS subdir, make one. + if ($arg =~ m|/|) { # full path + ($dir = $arg) =~ s|/[^/]+$||; + mkdir("$dir/RCS", 0775); + } else { # in $CWD + mkdir("RCS", 0775); + } + } + $exit = 0; + if ($description ne "") { + $cmd = "ci -t-'$description' @_"; + } else { + $cmd = "ci @_"; + } + warn "$cmd\n" if $verbose; + system "$cmd"; + system "co @_" unless $noget; +} + +# Like create without the get. +sub enter { &create("-g", @_); } + +# Edit - get the working file editable +sub edit { &get("-e", @_); } + +# co - normal RCS +sub co { &get(@_); } + +# Get - get the working file +# -e Retrieve a version for editing. +# Same as co -l. +# -p Print the file to stdout. +# -k Suppress expansion of ID keywords. +# Like co -kk. +# -s Suppress all output. +# +# Note that all other options are passed to co(1). +sub get +{ + local($arg, $working, $f, $p); + + $f = $p = 0; + foreach $arg (@_) { + # Options... + $arg = "-l" if ($arg eq "-e"); + $arg = "-kk" if ($arg eq "-k"); + $arg = "-q" if ($arg eq "-s"); + $f = 1 if ($arg eq "-f"); + $p = 1 if ($arg eq "-p"); # XXX - what if -sp? + + next if $arg =~ /^-/ || $p; + + # Check for writable files and skip them unless someone asked + # for co's -f option. + ($working = $arg) =~ s|,v$||; + $working =~ s|RCS/||; + if ((-w $working) && $f == 0) { + warn "ERROR [$arg]: writable `$working' exists.\n"; + $arg = ""; + } + } + @files = grep(/,v/, @_); + if ($#files == -1) { + warn "$0 $cmd: no files to get. @_\n"; + $exit = 1; + } else { + system "co @_"; + $exit = 0; + } +} + +# Aliases for history. 
+sub prt { &history(@_); } +sub prs { &history(@_); } + +# History - change history sub command +sub history +{ + local(@history); + + open(RL, "rlog @_|"); + # Read the whole history + while (defined($r = <RL>)) { + # Read the history for one file. + if ($r !~ /^[=]+$/) { + push(@history, $r); + next; + } + &print_history(@history); + @history = (); + } + close(RL); + print "+-----------------------------------\n"; + $exit = 0; +} + +sub print_history +{ + for ($i = 0; $i <= $#_; ++$i) { + # Get the one time stuff + if ($_[$i] =~ /^RCS file:/) { + $_[$i] =~ s/RCS file:\s*//; + chop($_[$i]); + print "+------ $_[$i] -------\n|\n"; + } + + # Get the history + if ($_[$i] =~ /^----------------------------/) { + local($rev, $date, $author, $lines) = ("", "", "", ""); + + $i++; + die "Bad format\n" unless $_[$i] =~ /revision/; + $_[$i] =~ s/revision\s+//; + chop($_[$i]); + $rev = $_[$i]; + $i++; + die "Bad format\n" unless $_[$i] =~ /date/; + @parts = split(/[\s\n;]+/, $_[$i]); + for ($j = 0; $j <= $#parts; $j++) { + if ($parts[$j] =~ /date/) { + $j++; + $date = "$parts[$j] "; + $j++; + $date .= "$parts[$j]"; + } + if ($parts[$j] =~ /author/) { + $j++; + $author = $parts[$j]; + } + if ($parts[$j] =~ /lines/) { + $j++; + $lines = "$parts[$j] "; + $j++; + $lines .= "$parts[$j]"; + } + } + print "| $rev $date $author $lines\n"; + while ($_[++$i] && + $_[$i] !~ /^----------------------------/) { + print "| $_[$i]"; ### unless $rev =~ /^1\.1$/; + } + print "|\n"; + $i--; + } + } +} + +# Show changes between working file and RCS file +# +# -C -> -c for compat with sccs (not sure if this is needed...). +sub diffs +{ + local(@working); + local($diff) = "diff"; + local($rev) = ""; + + while ($_[0] =~ /^-/) { + if ($_[0] eq "-C") { + $diff .= " -c"; + shift(@_); + } elsif ($_[0] =~ /^-r/) { + $rev = $_[0]; + shift(@_); + } elsif ($_[0] eq "-sdiff") { + $TIOCGWINSZ = 1074295912; # IRIX 5.x, 6.x, and SunOS 4.x. Cool. 
+ $buf = "abcd"; + if (ioctl(STDIN, $TIOCGWINSZ, $buf)) { + ($row, $col) = unpack("ss", $buf); + $wid = $col; + $row = 1 if 0; # lint + } else { + $wid = 80; + } + $diff = "sdiff -w$wid"; + shift(@_); + } else { + $diff .= " $_[0]"; + shift(@_); + } + + } + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # No working file? + if (!-f $working[$i]) { + warn "No working file $working[$i] for $_[$i]\n" + if $debug; + next; + } + + # Read only? Skip. + next unless (-w $working[$i]); + + # Show the changes + select(STDOUT); $| = 1; + print "\n------ $working[$i]$rev ------\n"; + $| = 0; + # XXX - flush stdout. + if ($diff =~ /^sdiff/) { + system "co -q -p -kkvl $rev $_[$i] > /tmp/sdiff.$$" . + "&& $diff /tmp/sdiff.$$ $working[$i]"; + # XXX - interrupts? + unlink("/tmp/sdiff.$$"); + } else { + system "co -q -p -kkvl $rev $_[$i] |" . + " $diff - $working[$i]"; + } + } + + $exit = 0; +} + +# delta - check in the files +sub delta +{ + local($description) = (""); + local($i, @working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; ++$i) { + # Options... + if ($_[$i] =~ /^-[yd]/) { + ($description = $_[$i]) =~ s/^-[yd]/-m/; + $description = "'" . $description . "'"; + $_[$i] = ""; + next; + } + $_[$i] = "-q" if $_[$i] eq "-s"; + $_[$i] = "" unless -f $working[$i]; + } + $exit = 0; + warn "ci $description @_\n" if $verbose; + system "ci $description @_"; +} + +# Allow RCS interface ci +sub ci +{ + &delta(@_); +} + +# delget +sub delget +{ + &delta(@_); + &get(@_); # If there was a description, delta nuked it... +} + +# deledit +sub deledit +{ + &delta(@_); + &get("-e", @_); # If there was a description, delta nuked it... +} + + +# info - who is editing what +sub info +{ + local(@working); + + @working = &working(@_); + for ($i = 0; $i <= $#_; $i++) { + open(RCS, $_[$i]); + while (defined($r = <RCS>)) { + last if $r =~ /locks/; + } + @locks = (); + while (defined($r = <RCS>)) { + # XXX - I use "comment" a delimter. 
+ last if $r =~ /comment/; + $r =~ s/^\s+//; + chop($r); + push(@locks, $r); + } + close(RCS); + if ($#locks > -1) { + warn "$working[$i]: being edited: @locks\n"; + } + } + $exit = 0; +} + +# Fix - fix the last change to a file +sub fix +{ + foreach $f (@_) { + next unless -f $f; + open(F, $f); + while (defined(<F>)) { last if /head\s\d/; } close(F); + unless ($_ && /head/) { + warn "$0 $cmd: No head node found in $f\n"; + next; + } + s/head\s+//; chop; chop; $rev = $_; + ($working = $f) =~ s/,v//; + $working =~ s|RCS/||; + system "co -q $f && rcs -o$rev $f && rcs -l $f && chmod +w $working"; + } + $exit = 0; +} + +# print - print the history and the latest revision of the file +sub print +{ + local($file); + + foreach $file (@_) { + &history($file); + &get("-s", "-p", $file); + } + $exit = 0; +} + + +# Example - example sub command +# -Q change this option to -q just to show how. +sub example +{ + local($arg, $working); + + foreach $arg (@_) { + # Options... + $arg = "-Q" if ($arg eq "-q"); + } + warn "rlog @_\n" if $verbose; + system "rlog @_"; + $exit = 0; +} + diff --git a/performance/lmbench3/scripts/results b/performance/lmbench3/scripts/results new file mode 100755 index 0000000..cd07c15 --- /dev/null +++ b/performance/lmbench3/scripts/results @@ -0,0 +1,39 @@ +#!/bin/sh + +# $Id$ + +OS=`../scripts/os` +CONFIG=`../scripts/config` +RESULTS=results/$OS +BASE=../$RESULTS/`uname -n` +EXT=0 + +if [ ! -f "../bin/$OS/$CONFIG" ] +then echo "No config file?" + exit 1 +fi +. ../bin/$OS/$CONFIG + +if [ ! 
-d ../$RESULTS ] +then mkdir -p ../$RESULTS +fi +RESULTS=$BASE.$EXT +while [ -f $RESULTS ] +do EXT=`expr $EXT + 1` + RESULTS=$BASE.$EXT +done + +cd ../bin/$OS +PATH=.:${PATH}; export PATH +export SYNC_MAX +export OUTPUT +lmbench $CONFIG 2>../${RESULTS} + +if [ X$MAIL = Xyes ] +then echo Mailing results + (echo ---- $INFO --- + cat $INFO + echo ---- $RESULTS --- + cat ../$RESULTS) | mail lmbench3@xxxxxxxxxxxx +fi +exit 0 diff --git a/performance/lmbench3/scripts/save b/performance/lmbench3/scripts/save new file mode 100755 index 0000000..cf61997 --- /dev/null +++ b/performance/lmbench3/scripts/save @@ -0,0 +1,26 @@ +# Save the input in the specified file if possible. If the file exists, +# add a numeric suffice, i.e., .1, and increment that until the file +# does not exist. Use the first name found as the file to save. +# +# Typical usage is: xroff -man -fH *.1 | save MAN.PS +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# $Id: save 1.4 00/01/31 15:29:42-08:00 lm@xxxxxxxxxxxxxxx $ +eval 'exec perl -Ssw $0 "$@"' + if 0; + +$base = $#ARGV == 0 ? shift : "save"; +$file = $base; +$ext = 1; + +while (-e $file) { + $file = "$base.$ext"; + $ext++; +} +warn "Saving in $file\n"; +open(FD, ">$file"); +while(<>) { + print FD; +} +exit 0; diff --git a/performance/lmbench3/scripts/stats b/performance/lmbench3/scripts/stats new file mode 100755 index 0000000..0b60667 --- /dev/null +++ b/performance/lmbench3/scripts/stats @@ -0,0 +1,50 @@ + +# Convert the Y coordinate to an average + +eval "exec perl -sS $0 $*" + if 0; + +@values = (); +$sum = $n = 0; +$min = 1.7E+308; +$max = 2.2E-308; +while (<>) { + next if /^[%#]/; + split; + if ($_[0] > 1000000) { + #warn "$file: ignoring $_"; + next; + } + if ($#_ >= 1) { + $val = $_[1]; + } else { + $val = $_[0]; + } + push(@values, $val); + $sum += $val; + $min = $val if $val < $min; + $max = $val if $val > $max; + $n++; +} +# Do some statistics. 
+@s = sort(@values); +if ($n & 1) { + $median = $s[($n + 1)/2]; +} else { + $i = $n / 2; + $median = ($s[$i] + $s[$i+1]) / 2; +} +$avg = $sum/$n; +$avgdev = $var = 0; +foreach $_ (@values) { + $var += ($_ - $median) ** 2; + $tmp = $_ - $median; + $avgdev += $tmp > 0 ? $tmp : -$tmp; +} +$var /= $n - 1; +$stddev = sqrt($var); +$avgdev /= $n; +#printf("%8s %8s %8s %8s %8s %4s %8s\n", "Min", "Max", "Average", "Median", "Std Dev", "%", "Avg Dev"); +#printf "%8.2f %8.2f %8.2f %8.2f %8.2f %4.1f%% %8.2f\n", $min, $max, $avg, $median, $stddev, $stddev/$median*100, $avgdev; +printf "%4.1f%%\n", $stddev/$median*100; +exit 0; diff --git a/performance/lmbench3/scripts/statsummary b/performance/lmbench3/scripts/statsummary new file mode 100755 index 0000000..21e6266 --- /dev/null +++ b/performance/lmbench3/scripts/statsummary @@ -0,0 +1,1075 @@ + +# Generate an ascii summary from lmbench result files BY HOSTNAME +# instead of architecture. Sorry, I think of these tools as being +# used to measure and prototype particular named systems, not as +# being useful to measure once and for all "i686-linux" systems, +# which might well have different motherboards, chipsets, memory +# clocks, CPU's (anything from PPro through to PIII so far) and +# so forth. Linux systems are far to heterogeneous to be categorized +# with two or three descriptors, so might as well just use hostname +# for shorthand... +# +# Usage: statsummary file file file... +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. +# +# $Id: statsummary,v 1.5 2000/07/08 21:06:49 rgb Exp $ +# +# +# Edit History. I'm starting out with Larry's getsummary. Then I'm +# going to splice in a very simple set of stats routines that are +# passed an array in his standard form and return a structure containing +# max, min, mean, median, unbiased standard deviation and we'll go from +# there. 
However I'll likely print out only mean and SD and will try +# to preserve Larry's general layout at that. Oh, and I'm going to add +# COMMENTS to the script. Drives me nuts to work on something without +# comments. 7/6/00 + +eval 'exec perl -Ssw $0 "$@"' + if 0; + +# +# This segment loops through all the output files and pushes the +# specific field values it needs into suitably named arrays. It +# counts while it does so so it can check to be sure that all +# the input files are complete. +$n = 0; +@hosts = (); +foreach $file (@ARGV) { + open(FD, $file) || die "$0: can't open $file"; + # I just want @file to contain the hostname, not the path or architecture. + # However, we have reason to need the associated filename (no path) to + # to help with debugging. + # Strip off the path + $file =~ s/(.*)\///; + # Split the filename from the number. This will probably break if the + # hostname contains more "."'s. However, I'm too lazy to figure out + # how to make this work totally robustly. It would be easy if the + # the host datafiles were all created according to the "hostname.count" + # format, because then a simple regexp would pull off just the hostname + # or the count. Not so easy when a hostname/count might contain no "."'s + # at all... + $filecount = ""; + ($file,$filecount) = split(/\./,$file); + # fix silly bug caused by starting numbering at blank. + if(! $filecount){ + $filecount = 0; + } + # Debugging... + # print STDERR "Found file $file with count $filecount\n"; + push(@file, $file); + push(@filecount, $filecount); + + # This should just push UNIQUE new hosts onto @hosts. 
+ $numhosts = @hosts; + if($numhosts){ + $lasthost = $hosts[$numhosts-1]; + } else { + $lasthost = ""; + } + if($lasthost !~ /$file/){ + push(@hosts, $file); + } + + $mhz = 0; + while (<FD>) { + chop; + next if m|scripts/lmbench: /dev/tty|; + if (/^\[lmbench/) { + push(@uname, $_); + if (/lmbench1\./) { + $version = 1; + } else { + $version = 2; + } + } + if (/MHZ/ && !$mhz) { + @_ = split; + $_[1] =~ s/\]//; + push(@misc_mhz, $_[1]); + $mhz = 1; + } elsif (/Mhz/ && !$mhz) { + @_ = split; + push(@misc_mhz, $_[0]); + $mhz = 1; + } + if (/^Select on 100 fd/) { + @_ = split; + push(@lat_select, $_[4]); + $tmp = $lat_select[0]; # Just to shut up the error parser + } + if (/^Simple syscall:/) { + @_ = split; + push(@lat_syscall, $_[2]); + $tmp = $lat_syscall[0]; # Just to shut up the error parser + } + if (/^Simple read:/) { + @_ = split; + push(@lat_read, $_[2]); + $tmp = $lat_read[0]; # Just to shut up the error parser + } + if (/^Simple write:/) { + @_ = split; + push(@lat_write, $_[2]); + $tmp = $lat_write[0]; # Just to shut up the error parser + } + if (/^Simple stat:/) { + @_ = split; + push(@lat_stat, $_[2]); + $tmp = $lat_stat[0]; # Just to shut up the error parser + } + if (/^Simple open.close:/) { + @_ = split; + push(@lat_openclose, $_[2]); + $tmp = $lat_openclose[0]; # Just to shut up the error parser + } + if (/^Null syscall:/) { # Old format. 
+ @_ = split; + push(@lat_write, $_[2]); + $tmp = $lat_write[0]; # Just to shut up the error parser + } + if (/^Signal handler installation:/) { + @_ = split; + push(@lat_siginstall, $_[3]); + $tmp = $lat_siginstall[0]; # Just to shut up the error parser + } + if (/^Signal handler overhead:/) { + @_ = split; + push(@lat_sigcatch, $_[3]); + $tmp = $lat_sigcatch[0]; # Just to shut up the error parser + } + if (/^Protection fault:/) { + @_ = split; + push(@lat_protfault, $_[2]); + $tmp = $lat_protfault[0]; # Just to shut up the error parser + } + if (/^Pipe latency:/) { + @_ = split; + push(@lat_pipe, $_[2]); + $tmp = $lat_pipe[0]; # Just to shut up the error parser + } + if (/AF_UNIX sock stream latency:/) { + @_ = split; + push(@lat_unix, $_[4]); + $tmp = $lat_unix[0]; # Just to shut up the error parser + } + if (/^UDP latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_udp_local, $_[4]); + $tmp = $lat_udp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_udp_net, $_[4]); + $tmp = $lat_udp_net[0]; # Just to shut up the error parser + } + } + if (/^TCP latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_tcp_local, $_[4]); + $tmp = $lat_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_tcp_net, $_[4]); + $tmp = $lat_tcp_net[0]; # Just to shut up the error parser + } + } + if (/^RPC\/udp latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_rpc_udp_local, $_[4]); + $tmp = $lat_rpc_udp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_rpc_udp_net, $_[4]); + $tmp = $lat_rpc_udp_net[0]; # Just to shut up the error parser + } + } + if (/^RPC\/tcp latency using /) { + if(/localhost:/) { + @_ = split; + push(@lat_rpc_tcp_local, $_[4]); + $tmp = $lat_rpc_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_rpc_tcp_net, $_[4]); + $tmp = $lat_rpc_tcp_net[0]; # Just to shut up the error parser + } + } + if 
(/^TCP\/IP connection cost to /) { + if(/localhost:/) { + @_ = split; + push(@lat_tcp_connect_local, $_[5]); + $tmp = $lat_tcp_connect_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@lat_tcp_connect_net, $_[5]); + $tmp = $lat_tcp_connect_net[0]; # Just to shut up the error parser + } + } + if (/^Socket bandwidth using /) { + if(/localhost:/) { + @_ = split; + push(@bw_tcp_local, $_[4]); + $tmp = $bw_tcp_local[0]; # Just to shut up the error parser + } else { + @_ = split; + push(@bw_tcp_net, $_[4]); + $tmp = $bw_tcp_net[0]; # Just to shut up the error parser + } + } + if (/^AF_UNIX sock stream bandwidth:/) { + @_ = split; + push(@bw_unix, $_[4]); + $tmp = $bw_unix[0]; # Just to shut up the error parser + } + if (/^Process fork.exit/) { + @_ = split; + push(@lat_nullproc, $_[2]); + $tmp = $lat_nullproc[0]; # Just to shut up the error parser + } + if (/^Process fork.execve:/) { + @_ = split; + push(@lat_simpleproc, $_[2]); + $tmp = $lat_simpleproc[0]; # Just to shut up the error parser + } + if (/^Process fork..bin.sh/) { + @_ = split; + push(@lat_shproc, $_[3]); + $tmp = $lat_shproc[0]; # Just to shut up the error parser + } + if (/^Pipe bandwidth/) { + @_ = split; + push(@bw_pipe, $_[2]); + $tmp = $bw_pipe[0]; # Just to shut up the error parser + } + if (/^File .* write bandwidth/) { + @_ = split; + $bw = sprintf("%.2f", $_[4] / 1024.); + push(@bw_file, $bw); + $tmp = $bw_file[0]; # Just to shut up the error parser + } + if (/^Pagefaults on/) { + @_ = split; + push(@lat_pagefault, $_[3]); + $tmp = $lat_pagefault[0]; # Just to shut up the error parser + } + if (/^"mappings/) { + $value = &getbiggest("memory mapping timing"); + push(@lat_mappings, $value); + $tmp = $lat_mappings[0]; # Just to shut up the error parser + } + if (/^"read bandwidth/) { + $value = &getbiggest("reread timing"); + push(@bw_reread, $value); + $tmp = $bw_reread[0]; # Just to shut up the error parser + } + if (/^"Mmap read bandwidth/) { + $value = 
&getbiggest("mmap reread timing"); + push(@bw_mmap, $value); + $tmp = $bw_mmap[0]; # Just to shut up the error parser + } + if (/^"libc bcopy unaligned/) { + $value = &getbiggest("libc bcopy timing"); + push(@bw_bcopy_libc, $value); + $tmp = $bw_bcopy_libc[0]; # Just to shut up the error parser + } + if (/^"unrolled bcopy unaligned/) { + $value = &getbiggest("unrolled bcopy timing"); + push(@bw_bcopy_unrolled, $value); + $tmp = $bw_bcopy_unrolled[0]; # Just to shut up the error parser + } + if (/^Memory read/) { + $value = &getbiggest("memory read & sum timing"); + push(@bw_mem_rdsum, $value); + $tmp = $bw_mem_rdsum[0]; # Just to shut up the error parser + } + if (/^Memory write/) { + $value = &getbiggest("memory write timing"); + push(@bw_mem_wr, $value); + $tmp = $bw_mem_wr[0]; # Just to shut up the error parser + } + if (/^"File system latency/) { + while (<FD>) { + next if /Id:/; + if (/^0k/) { + @_ = split; + push(@fs_create_0k, $_[2]); + push(@fs_delete_0k, $_[3]); + $tmp = $fs_create_0k[0]; # Just to shut up the error parser + $tmp = $fs_delete_0k[0]; # Just to shut up the error parser + } elsif (/^1k/) { + @_ = split; + push(@fs_create_1k, $_[2]); + push(@fs_delete_1k, $_[3]); + $tmp = $fs_create_1k[0]; # Just to shut up the error parser + $tmp = $fs_delete_1k[0]; # Just to shut up the error parser + } elsif (/^4k/) { + @_ = split; + push(@fs_create_4k, $_[2]); + push(@fs_delete_4k, $_[3]); + $tmp = $fs_create_4k[0]; # Just to shut up the error parser + $tmp = $fs_delete_4k[0]; # Just to shut up the error parser + } elsif (/^10k/) { + @_ = split; + push(@fs_create_10k, $_[2]); + push(@fs_delete_10k, $_[3]); + $tmp = $fs_create_10k[0]; # Just to shut up the error parser + $tmp = $fs_delete_10k[0]; # Just to shut up the error parser + } else { + last; + } + } + } + if (/size=0/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx0_2, $_[1]); + $tmp = $lat_ctx0_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; 
push(@lat_ctx0_8, $_[1]); + $tmp = $lat_ctx0_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx0_16, $_[1]); + $tmp = $lat_ctx0_16[0]; # Just to shut up the error parser + } + last if /^\s*$/ || /^Memory/; + } + } + if (/size=16/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx16_2, $_[1]); + $tmp = $lat_ctx16_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; push(@lat_ctx16_8, $_[1]); + $tmp = $lat_ctx16_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx16_16, $_[1]); + $tmp = $lat_ctx16_16[0]; # Just to shut up the error parser + } + last if /^\s*$/; + } + } + if (/size=64/) { + while (<FD>) { + if (/^2 /) { + @_ = split; push(@lat_ctx64_2, $_[1]); + $tmp = $lat_ctx64_2[0]; # Just to shut up the error parser + } elsif (/^8 /) { + @_ = split; push(@lat_ctx64_8, $_[1]); + $tmp = $lat_ctx64_8[0]; # Just to shut up the error parser + } elsif (/^16 /) { + @_ = split; push(@lat_ctx64_16, $_[1]); + $tmp = $lat_ctx64_16[0]; # Just to shut up the error parser + } + last if /^\s*$/ || /^20/; + } + } + if (/^"stride=128/) { + $save = -1; + while (<FD>) { + if (/^0.00098\s/) { + @_ = split; + push(@lat_l1, $_[1]); + $tmp = $lat_l1[0]; # Just to shut up the error parser + } elsif (/^0.12500\s/) { + @_ = split; + push(@lat_l2, $_[1]); + $tmp = $lat_l2[0]; # Just to shut up the error parser + } elsif (/^[45678].00000\s/) { + @_ = split; + $size = $_[0]; + $save = $_[1]; + last if /^8.00000\s/; + } elsif (/^\s*$/) { + last; + } + } + if (!/^8/) { + warn "$file: No 8MB memory latency, using $size\n"; + } + push(@lat_mem, $save); + } + } + @warn = (); + foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_unix', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_ctx0_16', 
'lat_ctx0_2', 'lat_ctx0_8', + 'lat_ctx16_16', 'lat_ctx16_2', 'lat_ctx16_8', + 'lat_ctx64_16', 'lat_ctx64_2', 'lat_ctx64_8', 'lat_l1', + 'lat_l2', 'lat_mappings', 'lat_mem', 'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', + 'lat_rpc_tcp_local','lat_rpc_udp_local', + 'lat_tcp_connect_local', 'lat_tcp_local', 'lat_udp_local', + 'lat_rpc_tcp_net','lat_rpc_udp_net', + 'lat_tcp_connect_net', 'lat_tcp_net', 'lat_udp_net', + 'lat_select', 'lat_shproc', 'lat_sigcatch', + 'lat_siginstall', 'lat_simpleproc', 'lat_stat', + 'lat_syscall', 'lat_unix', 'lat_write', 'misc_mhz', + ) { + $last = eval '$#' . $array; + if ($last != $n) { + #warn "No data for $array in $file\n"; + push(@warn, $array); + eval 'push(@' . $array . ', -1);'; + } + } + if ($#warn != -1) { + warn "Missing data in $file: @warn\n"; + } + $n++; +} + +# +# OK, now all those arrays are packed. Because everything is keyed +# on raw hostname, we can do all the stats evaluations using a combination +# of @file and the array -- we march through @file and create a stats +# object (a % hash) with its name and do the obvious sums and so forth. +# should be very simple. +# +# However, to be fair to Larry, we do want to preserve the general flavor +# of the summary. However, the summary is now going to be output BY HOST +# and so we need a separate host-description section for each host. +# +# First we have to evaluate the stats, though. +# + +# +# Let's test this with just one small set of values... 
+foreach $array ( + 'bw_bcopy_libc', 'bw_bcopy_unrolled', 'bw_file', + 'bw_mem_rdsum', 'bw_mem_wr', 'bw_mmap', 'bw_pipe', + 'bw_reread', 'bw_tcp_local', 'bw_unix', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_l1', + 'lat_l2', 'lat_mappings', 'lat_mem', 'lat_nullproc', + 'lat_openclose', 'lat_pagefault', 'lat_pipe', + 'lat_protfault', 'lat_read', + 'lat_rpc_tcp_local','lat_rpc_udp_local', + 'lat_tcp_connect_local', 'lat_tcp_local', 'lat_udp_local', + 'lat_rpc_tcp_net','lat_rpc_udp_net', + 'lat_tcp_connect_net', 'lat_tcp_net', 'lat_udp_net', + 'lat_select', 'lat_shproc', 'lat_sigcatch', + 'lat_siginstall', 'lat_simpleproc', 'lat_stat', + 'lat_syscall', 'lat_unix', 'lat_write', 'misc_mhz', + ) { } # Empty just to save the full list someplace handy. + +# +# Oops. For some unfathomable reason lat_fs returns something other than +# an (average) time in nanoseconds. Why, I cannot imagine -- one could +# trivially invert so that it did so. One CANNOT DO STATS on inverse +# quantities, so we invert here and convert to nanoseconds +# so we can correctly do stats below. +foreach $array ( + 'fs_create_0k','fs_delete_0k','fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k','fs_create_10k','fs_delete_10k', + ) { + $cnt = 0; + foreach $entry (@$array){ + $$array[$cnt++] = 1.0e+9/$entry; + } + +} + +# Working copy. Let's just add things as they turn out to be +# appropriate. In fact, we'll add them in presentation order! 
+foreach $array ( + 'lat_syscall','lat_read', 'lat_write', 'lat_syscall', 'lat_stat', + 'lat_openclose','lat_select','lat_siginstall','lat_sigcatch', + 'lat_nullproc','lat_simpleproc','lat_shproc', + 'lat_ctx0_2','lat_ctx0_16','lat_ctx0_8', + 'lat_ctx16_16','lat_ctx16_2','lat_ctx16_8', + 'lat_ctx64_16','lat_ctx64_2','lat_ctx64_8', + 'lat_pipe','lat_unix', + 'lat_udp_local','lat_tcp_local',lat_tcp_connect_local, + 'lat_rpc_udp_local','lat_rpc_tcp_local', + 'lat_udp_net','lat_tcp_net',lat_tcp_connect_net, + 'lat_rpc_udp_net','lat_rpc_tcp_net', + 'fs_create_0k','fs_delete_0k', + 'fs_create_1k','fs_delete_1k', + 'fs_create_4k','fs_delete_4k', + 'fs_create_10k','fs_delete_10k', + 'lat_mappings','lat_protfault','lat_pagefault', + 'bw_pipe','bw_unix', + 'bw_tcp_local', # Note we need bw_udp_local as soon as it exists... + 'bw_reread','bw_mmap','bw_bcopy_libc','bw_bcopy_unrolled', + 'bw_mem_rdsum','bw_mem_wr', + 'bw_tcp_net', + 'lat_l1','lat_l2','lat_mem', + ) { + + # + # This should do it all, by name and collapsed by hostname + # + makestats($array); + +} + +# +# Fine, that seems to work. Now we break up the summary, BY HOST. +# For each host we print just ONE TIME key values that don't really +# vary (like its architecture information and clock). Then we print +# out a modified version of Larry's old summary. +# + +# +# First the header +# +print<<EOF; +======================================================================== + + L M B E N C H 3 . 0 S U M M A R Y + ------------------------------------ + +======================================================================== + +EOF + +# +# Now a host loop. Notice that @hosts is a list of hosts +# +$numhosts = @hosts; +for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + # Obviously we need a better way to fill in this information. + # Linux provides /proc/cpuinfo, which is just perfect and trivial + # to parse. However, we should probably read this in from e.g. 
+ # config/$host.conf, which can be created either automagically or + # by hand. This file should also be used to control the running + # of the benchmark suite, which in turn should be done by means of + # a script call, not a make target. I'm getting there... + # + # Oh, one last note. It would be VERY CONVENIENT to have the config + # information stored in perl. So convenient that the following should + # BE the format of the config file... (up to the next comment) + $CPU = "Celeron(Mendocino)"; + $CPUFAMILY = "i686"; + $MHz = 400; + $L1CODE = 16; + $L1DATA = 16; + $L2SIZE = 128; + $memsize = 128; + $memspeed = "PC100"; + $memtype = "SDRAM"; + @DISKS = ("/dev/hda","/dev/hdb","/dev/hdc"); + @DISKTYPE = ("IBM-DJNA-371350, ATA DISK drive", "Disk 2", "Disk etc."); + @NETWORKS = ("ethernet-100","SneakerNet @ 3 meters/second"); + @NICTYPE = ("Lite-On 82c168 PNIC rev 32","Nike Sports (etc.)"); + @NETHUB = ("Netgear FS108 Fast Ethernet Switch","The Floor"); + # + # OK, given this wealth of detail (which can be sourced directly into + # the perl script from the host config file if we are clever) we now + # print it into the report/summary. 
+ # + printf("HOST:\t\t$host\n"); + printf("CPU:\t\t$CPU\n"); + printf("CPU Family:\t$CPUFAMILY\n"); + printf("MHz:\t\t$MHz\n"); + printf("L1 Cache Size:\t$L1CODE KB (code)/$L1DATA KB (data)\n"); + printf("L2 Cache Size:\t$L2SIZE KB\n"); + printf("Memory:\t\t$memsize MB of $memspeed $memtype\n"); + printf("OS Kernel:\t%13s\n",&getos($uname[0])); + printf("Disk(s):\n"); + $numdisks = @DISKS; + for($j=0;$j<$numdisks;$j++){ + printf("\t\t%d) %s: %s\n",$j+1,$DISKS[$j],$DISKTYPE[$j]); + } + printf("Network(s):\n"); + $numnets = @NETWORKS; + for($j=0;$j<$numnets;$j++){ + printf("\t\t%d) %s: %s\n",$j+1,$NETWORKS[$j],$NICTYPE[$j]); + printf("\t\t Switch/Hub: %s\n",$NETHUB[$j]); + } + print<<EOF; + + +------------------------------------------------------------------------ +Processor, Processes - average times in microseconds - smaller is better +------------------------------------------------------------------------ + null null open/ + call Error I/O Error stat Error close Error +------ ------ ------ ------ ------ ------ ------ ------ +EOF + +# +# In all the output below, averaged arrays are accessed by the hash: +# $stats{$host}{$array}{mean or stddev} (or whatever) + + @fs_delete_4k = @lat_ctx0_8 = @bw_file = @lat_ctx0_16 = @fs_delete_1k = + @fs_create_4k = @fs_create_1k + if 0; # lint + + # If they have no /dev/zero, use /dev/null, else average them. + if ($stats{$host}{lat_read}{mean} == -1) { + $lat_rw_mean = $stats{$host}{lat_write}{mean}; + $lat_rw_stddev = $stats{$host}{lat_write}{stddev}; + } else { + $lat_rw_mean = ($stats{$host}{lat_read}{mean} + $stats{$host}{lat_write}{mean})/2; + $lat_rw_stddev = ($stats{$host}{lat_read}{stddev} + $stats{$host}{lat_write}{stddev})/2; + } + # We have to pick a format adequate for these numbers. We'll shoot for + # %5.2f and see how it goes. 
+ printf("%6.3f %6.3f ",$stats{$host}{lat_syscall}{mean},$stats{$host}{lat_syscall}{stddev}); + printf("%6.3f %6.3f ",$lat_rw_mean,$lat_rw_stddev); + printf("%6.3f %6.3f ",$stats{$host}{lat_stat}{mean},$stats{$host}{lat_stat}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_openclose}{mean},$stats{$host}{lat_openclose}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ + signal signal +select Error instll Error catch Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.1f %6.2f ",$stats{$host}{lat_select}{mean},$stats{$host}{lat_select}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_siginstall}{mean},$stats{$host}{lat_siginstall}{stddev}); + printf("%6.3f %6.3f ",$stats{$host}{lat_sigcatch}{mean},$stats{$host}{lat_sigcatch}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ + fork exec shell + proc Error proc Error proc Error +------- ------- ------- ------- ------- ------- +EOF + printf("%7.1f %7.2f ", + $stats{$host}{lat_nullproc}{mean},$stats{$host}{lat_nullproc}{stddev}); + printf("%7.1f %7.2f ", + $stats{$host}{lat_simpleproc}{mean},$stats{$host}{lat_simpleproc}{stddev}); + printf("%7.1f %7.2f ", + $stats{$host}{lat_shproc}{mean},$stats{$host}{lat_shproc}{stddev}); + # End with this to complete the line... 
+ printf("\n"); + print<<EOF; + + +------------------------------------------------------------------------ +Context switching - times in microseconds - smaller is better +------------------------------------------------------------------------ +2p/0K 2p/16K 2p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_2}{mean},$stats{$host}{lat_ctx0_2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_2}{mean},$stats{$host}{lat_ctx16_2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_2}{mean},$stats{$host}{lat_ctx64_2}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ +8p/0K 8p/16K 8p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_8}{mean},$stats{$host}{lat_ctx16_8}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_8}{mean},$stats{$host}{lat_ctx16_8}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_8}{mean},$stats{$host}{lat_ctx64_8}{stddev}); + # End with this to complete the line... + printf("\n"); + print<<EOF; +........................................................................ +16p/0K 16p/16K 16p/64K + Error Error Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx0_16}{mean},$stats{$host}{lat_ctx0_16}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx16_16}{mean},$stats{$host}{lat_ctx16_16}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_ctx64_16}{mean},$stats{$host}{lat_ctx64_16}{stddev}); + # End with this to complete the line... 
+ printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Local* Communication latencies in microseconds - smaller is better +------------------------------------------------------------------------ + Pipe AF + Error UNIX Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_pipe}{mean},$stats{$host}{lat_pipe}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_unix}{mean},$stats{$host}{lat_unix}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + UDP TCP TCP + Error Error Connect Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_udp_local}{mean},$stats{$host}{lat_udp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_local}{mean},$stats{$host}{lat_tcp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_connect_local}{mean},$stats{$host}{lat_tcp_connect_local}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ 
+ RPC RPC + UDP Error TCP Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_udp_local}{mean},$stats{$host}{lat_rpc_udp_local}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_tcp_local}{mean},$stats{$host}{lat_rpc_tcp_local}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Network* Communication latencies in microseconds - smaller is better +------------------------------------------------------------------------ + UDP TCP TCP + Error Error Connect Error +------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_udp_net}{mean},$stats{$host}{lat_udp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_net}{mean},$stats{$host}{lat_tcp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_tcp_connect_net}{mean},$stats{$host}{lat_tcp_connect_net}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + RPC RPC + UDP Error TCP Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_udp_net}{mean},$stats{$host}{lat_rpc_udp_net}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_rpc_tcp_net}{mean},$stats{$host}{lat_rpc_tcp_net}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +File & VM system latencies in microseconds - smaller is better +------------------------------------------------------------------------ + 0k File 1K File +Create Error Delete Error Create Error Delete Error +------- ------- ------- ------- ------- ------- ------- ------- +EOF + $c0k = $stats{$host}{fs_create_0k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_0k}{mean}/1000; + $c0kerr = $stats{$host}{fs_create_0k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_0k}{stddev}/1000; + $d0k = $stats{$host}{fs_delete_0k}{mean} <= 0 ? 
-1 : $stats{$host}{fs_delete_0k}{mean}/1000; + $d0kerr = $stats{$host}{fs_delete_0k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_0k}{stddev}/1000; + $c1k = $stats{$host}{fs_create_1k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_1k}{mean}/1000; + $c1kerr = $stats{$host}{fs_create_1k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_1k}{stddev}/1000; + $d1k = $stats{$host}{fs_delete_1k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_1k}{mean}/1000; + $d1kerr = $stats{$host}{fs_delete_1k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_1k}{stddev}/1000; + printf("%7.2f %7.3f ", + $c0k,$c0kerr); + printf("%7.2f %7.3f ", + $d0k,$d0kerr); + printf("%7.2f %7.3f ", + $c1k,$c1kerr); + printf("%7.2f %7.3f ", + $d1k,$d1kerr); + printf("\n"); + print<<EOF; +........................................................................ + 4k File 10K File +Create Error Delete Error Create Error Delete Error +------- ------- ------- ------- ------- ------- ------- ------- +EOF + $c4k = $stats{$host}{fs_create_4k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_4k}{mean}/1000; + $c4kerr = $stats{$host}{fs_create_4k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_4k}{stddev}/1000; + $d4k = $stats{$host}{fs_delete_4k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_4k}{mean}/1000; + $d4kerr = $stats{$host}{fs_delete_4k}{stddev} <= 0 ? -1 : $stats{$host}{fs_delete_4k}{stddev}/1000; + $c10k = $stats{$host}{fs_create_10k}{mean} <= 0 ? -1 : $stats{$host}{fs_create_10k}{mean}/1000; + $c10kerr = $stats{$host}{fs_create_10k}{stddev} <= 0 ? -1 : $stats{$host}{fs_create_10k}{stddev}/1000; + $d10k = $stats{$host}{fs_delete_10k}{mean} <= 0 ? -1 : $stats{$host}{fs_delete_10k}{mean}/1000; + $d10kerr = $stats{$host}{fs_delete_10k}{stddev} <= 0 ? 
-1 : $stats{$host}{fs_delete_10k}{stddev}/1000; + printf("%7.2f %7.3f ", + $c4k,$c4kerr); + printf("%7.2f %7.3f ", + $d4k,$d4kerr); + printf("%7.2f %7.3f ", + $c10k,$c10kerr); + printf("%7.2f %7.3f ", + $d10k,$d10kerr); + printf("\n"); + print<<EOF; +........................................................................ + Mmap Prot Page +Latency Error Fault Error Fault Error +-------- -------- ------- ------- -------- -------- +EOF + printf("%8.2f %8.3f ", + $stats{$host}{lat_mappings}{mean},$stats{$host}{lat_mappings}{stddev}); + printf("%7.2f %7.3f ", + $stats{$host}{lat_protfault}{mean},$stats{$host}{lat_protfault}{stddev}); + printf("%8.2f %8.3f ", + $stats{$host}{lat_pagefault}{mean},$stats{$host}{lat_pagefault}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Local* Communication bandwidths in MB/s - bigger is better +------------------------------------------------------------------------ + Pipe AF + Error UNIX Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_pipe}{mean},$stats{$host}{bw_pipe}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_unix}{mean},$stats{$host}{bw_unix}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + UDP TCP + Error Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + -1,-1); + printf("%6.2f %6.3f ", + $stats{$host}{bw_tcp_local}{mean},$stats{$host}{bw_tcp_local}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ 
+ File Mmap Bcopy Bcopy +reread Error reread Error (libc) Error (hand) Error +------ ------ ------ ------ ------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_reread}{mean},$stats{$host}{bw_reread}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_mmap}{mean},$stats{$host}{bw_mmap}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_bcopy_libc}{mean},$stats{$host}{bw_bcopy_libc}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_bcopy_unrolled}{mean},$stats{$host}{bw_bcopy_unrolled}{stddev}); + printf("\n"); + print<<EOF; +........................................................................ + Mem Mem + read Error write Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + $stats{$host}{bw_mem_rdsum}{mean},$stats{$host}{bw_mem_rdsum}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{bw_mem_wr}{mean},$stats{$host}{bw_mem_wr}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +*Net* Communication bandwidths in MB/s - bigger is better +------------------------------------------------------------------------ + UDP TCP + Error Error +------ ------ ------ ------ +EOF + printf("%6.2f %6.3f ", + -1,-1); + printf("%6.2f %6.3f ", + $stats{$host}{bw_tcp_net}{mean},$stats{$host}{bw_tcp_net}{stddev}); + printf("\n"); + print<<EOF; + +------------------------------------------------------------------------ +Memory latencies in nanoseconds - smaller is better + (WARNING - may not be correct, check graphs) +------------------------------------------------------------------------ + L1 L2 Main +Cache Error Cache Error mem Error Guesses +------ ------ ------ ------ ------ ------ ------- +EOF + $msg = &check_caches; + if ($stats{$host}{lat_l1}{mean} < 0) { + printf("%6s %6s ", + "------","------"); + printf("%6s %6s ", + "------","------"); + printf("%6s %6s ", + "------","------"); + printf("%6s","Bad mhz?"); + } else { + printf("%6.2f %6.3f ", + 
$stats{$host}{lat_l1}{mean},$stats{$host}{lat_l1}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_l2}{mean},$stats{$host}{lat_l2}{stddev}); + printf("%6.2f %6.3f ", + $stats{$host}{lat_mem}{mean},$stats{$host}{lat_mem}{stddev}); + print $msg if ($msg =~ /L/); + } + printf("\n"); + + + +# This ends the host section... + print<<EOF; + +======================================================================== +EOF + +} + +exit 0; + + +# (33, %3d) +sub num +{ + local($val, $fmt) = @_; + local($str) = ""; + local($i); + + if ($val <= 0) { + $fmt =~ s/^.//; + while (length($fmt) > 1) { chop($fmt); } + for ($i = 0; $i < $fmt; $i++) { + $str .= " "; + } + return ($str); + } + $str = sprintf($fmt, $val); + $str; +} + +# Input looks like +# "benchmark name +# size value +# .... +# <blank line> +# +# Return the biggest value before the blank line. +sub getbiggest +{ + local($msg) = @_; + local($line) = 0; + + undef $save; + $value = 0; + while (<FD>) { + $line++; + #warn "$line $_"; + last if /^\s*$/; + $save = $_ if /^\d+\./; + } + if (defined $save) { + $_ = $save; + @d = split; + $value = $d[1]; + if (int($d[0]) < 4) { + warn "$file: using $d[0] size for $msg\n"; + } + } else { + warn "$file: no data for $msg\n"; + } + $value; +} + + +# Try and create sensible names from uname -a output +sub getos +{ + local(@info); + + @info = split(/\s+/, $_[0]); + "$info[3] $info[5]"; +} + +# Return true if the values differe by less than 10% +sub same +{ + local($a, $b) = @_; + + if ($a > $b) { + $percent = (($a - $b) / $a) * 100; + } else { + $percent = (($b - $a) / $b) * 100; + } + return ($percent <= 20); +} + +sub check_caches +{ + if (!&same($lat_l1[$i], $lat_l2[$i]) && + &same($lat_l2[$i], $lat_mem[$i])) { + " No L2 cache?"; + } elsif (&same($lat_l1[$i], $lat_l2[$i])) { + " No L1 cache?"; + } +} + +sub makestats +{ + + my $cnt=0; + my $host; + # Debugging + # print STDERR "Ready to make stats for array $array\n"; + # Zero the counters + $numhosts = @hosts; + 
for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + $stats{$host}{$array}{mean} = 0.0; + $stats{$host}{$array}{stddev} = 0.0; + $stats{$host}{$array}{count} = 0; + } + # Loop through ALL DATA. We use the hash to direct results to + # to the appropriate counters. + foreach $value (@$array){ + $host = $file[$cnt]; + if($$array[0] == -1){ + $stats{$host}{$array}{mean} = -1; + $stats{$host}{$array}{stddev} = -1; + # Debugging (and curiosity) + print STDERR "Oops. $array is empty.\n"; + return; + } + # Debugging + # print STDERR "$host/$array ($cnt): value is $value\n"; + $stats{$host}{$array}{mean} += $value; + $stats{$host}{$array}{stddev} += $value*$value; + $stats{$host}{$array}{count}++; + $cnt++; + } + for($i=0;$i<$numhosts;$i++){ + $host = $hosts[$i]; + $cnt = $stats{$host}{$array}{count}; + # Debugging Only + # print STDERR "Evaluating final mean/stddev of $cnt objects in $host/$array\n"; + if($cnt>1) { + $stats{$host}{$array}{mean} = $stats{$host}{$array}{mean} / $cnt; + $stats{$host}{$array}{stddev} = sqrt(($stats{$host}{$array}{stddev} / $cnt + - $stats{$host}{$array}{mean}*$stats{$host}{$array}{mean})/($cnt-1)); + } elsif($cnt == 1) { + # Wish one could assign "infinity". This probably breaks somewhere. + $stats{$host}{$array}{stddev} = 1.0e+1000; + } else { + # print STDERR "Error: Cannot average 0 $array results on $host\n"; + } + + # Debugging Only. + # print STDERR "$host/$array (average): $stats{$host}{$array}{mean} +/- $stats{$host}{$array}{stddev}\n"; + } + +} diff --git a/performance/lmbench3/scripts/synchronize b/performance/lmbench3/scripts/synchronize new file mode 100755 index 0000000..302db00 --- /dev/null +++ b/performance/lmbench3/scripts/synchronize @@ -0,0 +1,60 @@ +#!/bin/sh + +# %W% %@% Copyright (c) 1998 Larry McVoy. +# +# Usage: SYNC_PID=3 SYNC_MAX=20 synchronize /tmp/sync_dir +# +# Used to sync up a bunch of processes so that they can operate in lockstep +# as much as possible. 
+# +# The first process to try and sync will mkdir(pathname) and create a named +# pipe in the directory. It also creates a file, pathname/$PID where pid +# is not the process id, it is the process number. The group of processes +# must be numbered from 1..N and they must each know their number. The Nth +# process is the master. Whereas all the other processes block, opening the +# pipe, the master spins in a loop, waiting for pathname/1 .. pathname/N-1 +# to show up in the directory. When they are all there, the master opens +# the pipe for writing and all the other processes get woken up and leave. +# +# It is outside of this program, but the directory must not exist before the +# synchronization. So you typically rm -rf it well before trying to sync. + +if [ X$1 = X ]; then echo "Usage: $0 pathname"; exit 1; fi +if [ X$SYNC_PID = X ]; then echo "Must set SYNC_PID"; exit 1; fi +if [ X$SYNC_MAX = X ]; then echo "Must set SYNC_MAX"; exit 1; fi + +DIR=$1 +mkdir -p $DIR 2>/dev/null +if [ ! -e $DIR/fifo ] +then mkfifo $DIR/fifo 2>/dev/null + chmod 666 $DIR/fifo 2>/dev/null +fi + +# slaves just read the pipe +if [ $SYNC_PID != $SYNC_MAX ] +then touch $DIR/$SYNC_PID + read x < $DIR/fifo + exit 0 +fi + +# Master waits for all the other processes to get there +PIDS="" +I=1 +while [ $I -lt $SYNC_MAX ] +do PIDS=" $I$PIDS" + I=`expr $I + 1` +done +while true +do GO=Y + for s in $PIDS + do if [ ! -e $DIR/$s ] + then GO=N + fi + done + if [ $GO = Y ] + then # This assumes that all the processes will + echo sleep 2 > $DIR/fifo & + exit 0 + fi + msleep 250 +done diff --git a/performance/lmbench3/scripts/target b/performance/lmbench3/scripts/target new file mode 100755 index 0000000..77eee07 --- /dev/null +++ b/performance/lmbench3/scripts/target @@ -0,0 +1,24 @@ +#!/bin/sh + +# Figure out the OS name if possible. +# +# Hacked into existence by Larry McVoy (lm@xxxxxxx now lm@xxxxxxx). +# Copyright (c) 1994 Larry McVoy. GPLed software. 
+# $Id: target 1.3 00/01/31 15:29:43-08:00 lm@xxxxxxxxxxxxxxx $ +case `uname -s` in + *HP-UX*) echo hpux;; + *Linux*) echo linux;; + *IRIX*) echo irix;; + *AIX*) echo aix;; + BSD/OS) echo bsdi;; + *BSD*) echo bsd;; + *OSF1*) echo osf1;; + *ULTRIX*) echo ultrix;; + *SunOS*) case `uname -r` in + 4*) echo sunos;; + 5*) echo solaris;; + *) echo unknown;; + esac;; + *) echo unknown;; +esac +exit 0 diff --git a/performance/lmbench3/scripts/version b/performance/lmbench3/scripts/version new file mode 100755 index 0000000..879b700 --- /dev/null +++ b/performance/lmbench3/scripts/version @@ -0,0 +1,25 @@ +#!/bin/sh + +# %W% %@% + +F="no_such_file" +if [ -f version.h ] +then F=version.h +else if [ -f ../src/version.h ] + then F=../src/version.h + else if [ -f src/version.h ] + then F=src/version.h + fi + fi +fi +if [ -f $F ] +then VERS=`egrep 'MAJOR|MINOR' $F | awk '{print $3}'` + set `echo $VERS` + if [ $2 -lt 0 ] + then VERS=`echo $1$2 | sed s/-/alpha/` + else VERS=`echo $VERS |sed 's/ /./'` + fi + VERS=lmbench-$VERS +else VERS=lmench-2-something +fi +echo $VERS diff --git a/performance/lmbench3/scripts/xroff b/performance/lmbench3/scripts/xroff new file mode 100755 index 0000000..d5acf20 --- /dev/null +++ b/performance/lmbench3/scripts/xroff @@ -0,0 +1,5 @@ +#!/bin/sh + +# X previewer like groff/nroff scripts. 
+groff -P -filename -P "| groff -Z -X -Tps $*" -X -Tps "$@" +exit 0 diff --git a/performance/lmbench3/src/Makefile b/performance/lmbench3/src/Makefile new file mode 100644 index 0000000..089dcb9 --- /dev/null +++ b/performance/lmbench3/src/Makefile @@ -0,0 +1,506 @@ +# $Id$ + +# Make targets: +# +# lmbench [default] builds the benchmark suite for the current os/arch +# results builds, configures run parameters, and runs the benchmark +# rerun reruns the benchmark using the same parameters as last time +# scaling reruns the benchmark using same parameters as last time, +# except it asks what scaling value to use +# hardware reruns the hardware benchmarks using the same parameters +# os reruns the OS benchmarks using the same parameters +# clean cleans out sources and run configuration +# clobber clean and removes the bin directories +# shar obsolete, use cd .. && make shar +# depend builds make dependencies (needs gcc) +# debug builds all the benchmarks with '-g' debugging flag +# assembler builds the .s files for each benchmark +# +# This is largely self configuring. Most stuff is pretty portable. +# +# If you don't have gcc, try make CC=cc and see if that works. +# +# If you want to do cross-compilation try make OS=armv5tel-linux-gnu +# or whatever your OS string should be in the target environment. +# Since many embedded development environments also have a special +# cross-compiler, you might want to also select a particular compiler, +# so your build command would look something like: +# make OS=armv5tel-linux-gnu CC=gcc-arm +# +# Overriding the OS and CC make parameters needs to be done as an +# argument to make, not as an environment variable. See above comments. +# + +# I finally know why Larry Wall's Makefile says "Grrrr". 
+SHELL=/bin/sh + +CC=`../scripts/compiler` +MAKE=`../scripts/make` +AR=ar +ARCREATE=cr + +# base of installation location +BASE=/usr/local +O= ../bin/unknown +D= ../doc +TRUE=/bin/true +OS=`../scripts/os` +TARGET=`../scripts/target` +BINDIR=../bin/$(OS) +CONFIG=../bin/$(OS)/`../scripts/config` +UTILS=../scripts/target ../scripts/os ../scripts/gnu-os ../scripts/compiler \ + ../scripts/info ../scripts/info-template ../scripts/version \ + ../scripts/config ../scripts/config-run ../scripts/results \ + ../scripts/lmbench ../scripts/make ../scripts/build +INSTALL=cp +RESULTS=Results/$(OS) +SAMPLES=lmbench/Results/aix/rs6000 lmbench/Results/hpux/snake \ + lmbench/Results/irix/indigo2 lmbench/Results/linux/pentium \ + lmbench/Results/osf1/alpha lmbench/Results/solaris/ss20* + +COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) + +INCS = bench.h lib_mem.h lib_tcp.h lib_udp.h stats.h timing.h + +SRCS = bw_file_rd.c bw_mem.c bw_mmap_rd.c bw_pipe.c bw_tcp.c bw_udp.c \ + bw_unix.c \ + cache.c clock.c disk.c enough.c flushdisk.c getopt.c hello.c \ + lat_connect.c lat_ctx.c lat_fcntl.c lat_fifo.c lat_fs.c \ + lat_mem_rd.c lat_mmap.c lat_ops.c lat_pagefault.c lat_pipe.c \ + lat_proc.c lat_rpc.c lat_select.c lat_sig.c lat_syscall.c \ + lat_tcp.c lat_udp.c lat_unix.c lat_unix_connect.c lat_sem.c \ + lat_usleep.c lat_pmake.c \ + lib_debug.c lib_mem.c lib_stats.c lib_tcp.c lib_timing.c \ + lib_udp.c lib_unix.c lib_sched.c \ + line.c lmdd.c lmhttp.c par_mem.c par_ops.c loop_o.c memsize.c \ + mhz.c msleep.c rhttp.c seek.c timing_o.c tlb.c stream.c \ + bench.h lib_debug.h lib_tcp.h lib_udp.h lib_unix.h names.h \ + stats.h timing.h version.h + +ASMS = $O/bw_file_rd.s $O/bw_mem.s $O/bw_mmap_rd.s $O/bw_pipe.s \ + $O/bw_tcp.s $O/bw_udp.s $O/bw_unix.s $O/clock.s \ + $O/disk.s $O/enough.s $O/flushdisk.s $O/getopt.s $O/hello.s \ + $O/lat_connect.s $O/lat_ctx.s lat_fcntl.s $O/lat_fifo.s \ + $O/lat_fs.s $O/lat_mem_rd.s $O/lat_mmap.s $O/lat_ops.s \ + $O/lat_pagefault.s $O/lat_pipe.s $O/lat_proc.s 
$O/lat_rpc.s \ + $O/lat_select.s $O/lat_sig.s $O/lat_syscall.s $O/lat_tcp.s \ + $O/lat_udp.s $O/lat_unix.s $O/lat_unix_connect.s $O/lat_sem.s \ + $O/lib_debug.s $O/lib_mem.s \ + $O/lib_stats.s $O/lib_tcp.s $O/lib_timing.s $O/lib_udp.s \ + $O/lib_unix.s $O/lib_sched.s \ + $O/line.s $O/lmdd.s $O/lmhttp.s $O/par_mem.s \ + $O/par_ops.s $O/loop_o.s $O/memsize.s $O/mhz.s $O/msleep.s \ + $O/rhttp.s $O/timing_o.s $O/tlb.s $O/stream.s \ + $O/cache.s $O/lat_dram_page.s $O/lat_pmake.s $O/lat_rand.s \ + $O/lat_usleep.s $O/lat_cmd.s +EXES = $O/bw_file_rd $O/bw_mem $O/bw_mmap_rd $O/bw_pipe $O/bw_tcp \ + $O/bw_unix $O/hello \ + $O/lat_select $O/lat_pipe $O/lat_rpc $O/lat_syscall $O/lat_tcp \ + $O/lat_udp $O/lat_mmap $O/mhz $O/lat_proc $O/lat_pagefault \ + $O/lat_connect $O/lat_fs $O/lat_sig $O/lat_mem_rd $O/lat_ctx \ + $O/lat_sem \ + $O/memsize $O/lat_unix $O/lmdd $O/timing_o $O/enough \ + $O/msleep $O/loop_o $O/lat_fifo $O/lmhttp $O/lat_http \ + $O/lat_fcntl $O/disk $O/lat_unix_connect $O/flushdisk \ + $O/lat_ops $O/line $O/tlb $O/par_mem $O/par_ops \ + $O/stream +OPT_EXES=$O/cache $O/lat_dram_page $O/lat_pmake $O/lat_rand \ + $O/lat_usleep $O/lat_cmd +LIBOBJS= $O/lib_tcp.o $O/lib_udp.o $O/lib_unix.o $O/lib_timing.o \ + $O/lib_mem.o $O/lib_stats.o $O/lib_debug.o $O/getopt.o \ + $O/lib_sched.o + +lmbench: $(UTILS) + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build all + -@env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="-k $(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build opt + +results: lmbench + @env OS="${OS}" ../scripts/config-run + @env OS="${OS}" ../scripts/results + +rerun: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" ../scripts/results + +scaling: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; \ + else ../scripts/config-scaling $(CONFIG); fi + @env OS="${OS}" ../scripts/results + +hardware: lmbench + @if [ ! 
-f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" BENCHMARK_HARDWARE=YES BENCHMARK_OS=NO ../scripts/results + +os: lmbench + @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi + @env OS="${OS}" BENCHMARK_HARDWARE=NO BENCHMARK_OS=YES ../scripts/results + +install: lmbench + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build install-target + +install-target: + if [ ! -d $(BASE) ]; then mkdir $(BASE); fi + if [ ! -d $(BASE)/bin ]; then mkdir $(BASE)/bin; fi + if [ ! -d $(BASE)/include ]; then mkdir $(BASE)/include; fi + if [ ! -d $(BASE)/lib ]; then mkdir $(BASE)/lib; fi + cp $(EXES) $(BASE)/bin + cp $(INCS) $(BASE)/include + cp $O/lmbench.a $(BASE)/lib/libmbench.a + cd ../doc; env MAKEFLAGS="$(MAKEFLAGS)" make CC="${CC}" OS="${OS}" BASE="$(BASE)" install + + +# No special handling for all these +all: $(EXES) $O/lmbench +opt: $(OPT_EXES) +asm: $(ASMS) +$(ASMS): + $(CC) -S $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ `basename $@ .s`.c + +Wall: + @env CFLAGS="-Wall -ansi" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt + +debug: + @env CFLAGS="-g -O" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt + +assembler: + @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build asm + +bk.ver: ../SCCS/s.ChangeSet + rm -f bk.ver + -echo `bk prs -hr+ -d'$$if(:SYMBOL:){:SYMBOL: }:UTC:' ../ChangeSet;` > bk.ver + touch bk.ver + +dist: bk.ver + @if [ "X`cd ..; bk sfiles -c`" != "X" ]; then \ + echo "modified files!"; \ + false; \ + fi + @if [ "X`cd ..; bk pending`" != "X" ]; then \ + echo "pending changes!"; \ + false; \ + fi + cd ..; \ + SRCDIR=`pwd`; \ + DIR=`basename $${SRCDIR}`; \ + VERSION=`cat src/bk.ver| awk '{print $$1;}' | sed -e 's/Version-//g'`; \ + cd ..; \ + bk clone $${DIR} /tmp/lmbench-$${VERSION}; \ + cd /tmp/lmbench-$${VERSION}; \ + bk sfiles | xargs touch; \ + sleep 5; 
\ + bk get -s; \ + for d in doc results scripts src; do \ + cd $$d; bk get -s; cd ..; \ + done; \ + bk sfiles -U -g | xargs touch; \ + cd src; \ + make bk.ver; \ + cd /tmp; \ + tar czf $${SRCDIR}/../lmbench-$${VERSION}.tgz \ + lmbench-$${VERSION}; \ + rm -rf /tmp/lmbench-$${VERSION}; + +get $(SRCS): + -get -s $(SRCS) + +edit get-e: + get -e -s $(SRCS) + +clean: + /bin/rm -f ../bin/*/CONFIG ../bin/*/*.[oas] + /bin/rm -f *.[oas] + +clobber: + /bin/rm -rf ../bin* SHAR + +shar: + cd ../.. && shar lmbench/Results/Makefile $(SAMPLES) lmbench/scripts/* lmbench/src/Makefile lmbench/src/*.[ch] > lmbench/SHAR + +depend: ../scripts/depend + ../scripts/depend + +testmake: $(SRCS) $(UTILS) # used by scripts/make to test gmake + @true + +.PHONY: lmbench results rerun hardware os install all Wall debug \ + install install-target dist get edit get-e clean clobber \ + share depend testmake + +$O/lmbench : ../scripts/lmbench + rm -f $O/lmbench + sed -e "s/<version>/`cat bk.ver`/g" < ../scripts/lmbench > $O/lmbench + chmod +x $O/lmbench + +$O/lmbench.a: $(LIBOBJS) + /bin/rm -f $O/lmbench.a + $(AR) $(ARCREATE) $O/lmbench.a $(LIBOBJS) + -ranlib $O/lmbench.a + +$O/lib_timing.o : lib_timing.c $(INCS) + $(COMPILE) -c lib_timing.c -o $O/lib_timing.o +$O/lib_mem.o : lib_mem.c $(INCS) + $(COMPILE) -c lib_mem.c -o $O/lib_mem.o +$O/lib_tcp.o : lib_tcp.c $(INCS) + $(COMPILE) -c lib_tcp.c -o $O/lib_tcp.o +$O/lib_udp.o : lib_udp.c $(INCS) + $(COMPILE) -c lib_udp.c -o $O/lib_udp.o +$O/lib_unix.o : lib_unix.c $(INCS) + $(COMPILE) -c lib_unix.c -o $O/lib_unix.o +$O/lib_debug.o : lib_debug.c $(INCS) + $(COMPILE) -c lib_debug.c -o $O/lib_debug.o +$O/lib_stats.o : lib_stats.c $(INCS) + $(COMPILE) -c lib_stats.c -o $O/lib_stats.o +$O/lib_sched.o : lib_sched.c $(INCS) + $(COMPILE) -c lib_sched.c -o $O/lib_sched.o +$O/getopt.o : getopt.c $(INCS) + $(COMPILE) -c getopt.c -o $O/getopt.o + +$(UTILS) : + -cd ../scripts; make get + +# Do not remove the next line, $(MAKE) depend needs it +# MAKEDEPEND follows 
+$O/rhttp.s:rhttp.c timing.h stats.h bench.h +$O/rhttp: rhttp.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/rhttp rhttp.c $O/lmbench.a $(LDLIBS) + +$O/http.s:http.c timing.h stats.h bench.h +$O/http: http.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/http http.c $O/lmbench.a $(LDLIBS) + +$O/flushdisk.s:flushdisk.c +$O/flushdisk: flushdisk.c + $(COMPILE) -DMAIN -o $O/flushdisk flushdisk.c + +$O/mhz.s: mhz.c timing.h stats.h bench.h +$O/mhz: mhz.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/mhz mhz.c $O/lmbench.a $(LDLIBS) -lm + +$O/lat_ctx.s: lat_ctx.c timing.h stats.h bench.h +$O/lat_ctx: lat_ctx.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_ctx lat_ctx.c $O/lmbench.a $(LDLIBS) + +$O/lmhttp.s:lmhttp.c timing.h stats.h bench.h +$O/lmhttp: lmhttp.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lmhttp lmhttp.c $O/lmbench.a $(LDLIBS) + +$O/lat_http.s:lat_http.c timing.h stats.h bench.h +$O/lat_http: lat_http.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_http lat_http.c $O/lmbench.a $(LDLIBS) + +$O/bw_file_rd.s:bw_file_rd.c timing.h stats.h bench.h +$O/bw_file_rd: bw_file_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_file_rd bw_file_rd.c $O/lmbench.a $(LDLIBS) + +$O/bw_mem.s:bw_mem.c timing.h stats.h bench.h +$O/bw_mem: bw_mem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_mem bw_mem.c $O/lmbench.a $(LDLIBS) + +$O/bw_mmap_rd.s:bw_mmap_rd.c timing.h stats.h bench.h +$O/bw_mmap_rd: bw_mmap_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_mmap_rd bw_mmap_rd.c $O/lmbench.a $(LDLIBS) + +$O/bw_pipe.s:bw_pipe.c timing.h stats.h bench.h +$O/bw_pipe: bw_pipe.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_pipe bw_pipe.c $O/lmbench.a $(LDLIBS) + +$O/bw_tcp.s:bw_tcp.c bench.h timing.h stats.h lib_tcp.h +$O/bw_tcp: bw_tcp.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/bw_tcp bw_tcp.c $O/lmbench.a $(LDLIBS) + 
+$O/bw_udp.s:bw_udp.c bench.h timing.h stats.h lib_udp.h +$O/bw_udp: bw_udp.c bench.h timing.h stats.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/bw_udp bw_udp.c $O/lmbench.a $(LDLIBS) + +$O/bw_unix.s:bw_unix.c timing.h stats.h bench.h +$O/bw_unix: bw_unix.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/bw_unix bw_unix.c $O/lmbench.a $(LDLIBS) + +$O/disk.s:disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h +$O/disk: disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/disk disk.c $O/lmbench.a $(LDLIBS) + +$O/clock.s:clock.c timing.h stats.h bench.h +$O/clock: clock.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/clock clock.c $O/lmbench.a $(LDLIBS) + +$O/hello.s:hello.c +$O/hello: hello.c $O/lmbench.a + $(COMPILE) -o $O/hello hello.c $O/lmbench.a $(LDLIBS) + +$O/lat_alarm.s:lat_alarm.c timing.h stats.h bench.h +$O/lat_alarm: lat_alarm.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_alarm lat_alarm.c $O/lmbench.a $(LDLIBS) + +$O/lat_connect.s:lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h +$O/lat_connect: lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a + $(COMPILE) -o $O/lat_connect lat_connect.c $O/lmbench.a $(LDLIBS) + +$O/lat_unix_connect.s:lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h +$O/lat_unix_connect: lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a + $(COMPILE) -o $O/lat_unix_connect lat_unix_connect.c $O/lmbench.a $(LDLIBS) + +$O/lat_fs.s:lat_fs.c timing.h stats.h bench.h +$O/lat_fs: lat_fs.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fs lat_fs.c $O/lmbench.a $(LDLIBS) + +$O/lat_fcntl.s:lat_fcntl.c timing.h stats.h bench.h +$O/lat_fcntl: lat_fcntl.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fcntl lat_fcntl.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_rd.s:lat_mem_rd.c timing.h stats.h bench.h +$O/lat_mem_rd: lat_mem_rd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o 
$O/lat_mem_rd lat_mem_rd.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_rd2.s:lat_mem_rd2.c timing.h stats.h bench.h +$O/lat_mem_rd2: lat_mem_rd2.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_rd2 lat_mem_rd2.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_wr.s:lat_mem_wr.c timing.h stats.h bench.h +$O/lat_mem_wr: lat_mem_wr.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_wr lat_mem_wr.c $O/lmbench.a $(LDLIBS) + +$O/lat_mem_wr2.s:lat_mem_wr2.c timing.h stats.h bench.h +$O/lat_mem_wr2: lat_mem_wr2.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mem_wr2 lat_mem_wr2.c $O/lmbench.a $(LDLIBS) + +$O/lat_mmap.s:lat_mmap.c timing.h stats.h bench.h +$O/lat_mmap: lat_mmap.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mmap lat_mmap.c $O/lmbench.a $(LDLIBS) + +$O/lat_mmaprd.s:lat_mmaprd.c timing.h stats.h bench.h +$O/lat_mmaprd: lat_mmaprd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_mmaprd lat_mmaprd.c $O/lmbench.a $(LDLIBS) + +$O/lat_ops.s:lat_ops.c timing.h stats.h bench.h +$O/lat_ops: lat_ops.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_ops lat_ops.c $O/lmbench.a $(LDLIBS) + +$O/lat_pagefault.s:lat_pagefault.c timing.h stats.h bench.h +$O/lat_pagefault: lat_pagefault.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pagefault lat_pagefault.c $O/lmbench.a $(LDLIBS) + +$O/lat_pipe.s:lat_pipe.c timing.h stats.h bench.h +$O/lat_pipe: lat_pipe.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pipe lat_pipe.c $O/lmbench.a $(LDLIBS) + +$O/lat_fifo.s:lat_fifo.c timing.h stats.h bench.h +$O/lat_fifo: lat_fifo.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_fifo lat_fifo.c $O/lmbench.a $(LDLIBS) + +$O/lat_proc.s:lat_proc.c timing.h stats.h bench.h +$O/lat_proc: lat_proc.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_proc lat_proc.c $O/lmbench.a $(LDLIBS) + +$O/lat_rpc.s:lat_rpc.c timing.h stats.h bench.h +$O/lat_rpc: lat_rpc.c timing.h 
stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_rpc lat_rpc.c $O/lmbench.a $(LDLIBS) + +$O/lat_sig.s:lat_sig.c timing.h stats.h bench.h +$O/lat_sig: lat_sig.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_sig lat_sig.c $O/lmbench.a $(LDLIBS) + +$O/lat_syscall.s:lat_syscall.c timing.h stats.h bench.h +$O/lat_syscall: lat_syscall.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_syscall lat_syscall.c $O/lmbench.a $(LDLIBS) + +$O/lat_select.s: lat_select.c timing.h stats.h bench.h +$O/lat_select: lat_select.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_select lat_select.c $O/lmbench.a $(LDLIBS) + +$O/lat_tcp.s:lat_tcp.c timing.h stats.h bench.h lib_tcp.h +$O/lat_tcp: lat_tcp.c timing.h stats.h bench.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/lat_tcp lat_tcp.c $O/lmbench.a $(LDLIBS) + +$O/lat_udp.s:lat_udp.c timing.h stats.h bench.h lib_udp.h +$O/lat_udp: lat_udp.c timing.h stats.h bench.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/lat_udp lat_udp.c $O/lmbench.a $(LDLIBS) + +$O/lat_unix.s:lat_unix.c timing.h stats.h bench.h +$O/lat_unix: lat_unix.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_unix lat_unix.c $O/lmbench.a $(LDLIBS) + +$O/lib_tcp.s:lib_tcp.c bench.h lib_tcp.h +$O/lib_tcp: lib_tcp.c bench.h lib_tcp.h $O/lmbench.a + $(COMPILE) -o $O/lib_tcp lib_tcp.c $O/lmbench.a $(LDLIBS) + +$O/lib_udp.s:lib_udp.c bench.h lib_udp.h +$O/lib_udp: lib_udp.c bench.h lib_udp.h $O/lmbench.a + $(COMPILE) -o $O/lib_udp lib_udp.c $O/lmbench.a $(LDLIBS) + +$O/lmdd.s:lmdd.c timing.h stats.h bench.h +$O/lmdd: lmdd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lmdd lmdd.c $O/lmbench.a $(LDLIBS) + +$O/enough.s:enough.c timing.h stats.h bench.h +$O/enough: enough.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/enough enough.c $O/lmbench.a $(LDLIBS) + +$O/loop_o.s:loop_o.c timing.h stats.h bench.h +$O/loop_o: loop_o.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/loop_o loop_o.c 
$O/lmbench.a $(LDLIBS) + +$O/timing_o.s:timing_o.c timing.h stats.h bench.h +$O/timing_o: timing_o.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/timing_o timing_o.c $O/lmbench.a $(LDLIBS) + +$O/memsize.s:memsize.c timing.h stats.h bench.h +$O/memsize: memsize.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/memsize memsize.c $O/lmbench.a $(LDLIBS) + +$O/msleep.s:msleep.c timing.h stats.h bench.h +$O/msleep: msleep.c timing.h stats.h bench.h + $(COMPILE) -o $O/msleep msleep.c + +$O/line.s: line.c timing.h stats.h bench.h +$O/line: line.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/line line.c $O/lmbench.a $(LDLIBS) + +$O/tlb.s:tlb.c timing.h stats.h bench.h +$O/tlb: tlb.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/tlb tlb.c $O/lmbench.a $(LDLIBS) + +$O/cache.s:cache.c timing.h stats.h bench.h +$O/cache: cache.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/cache cache.c $O/lmbench.a $(LDLIBS) + +$O/par_mem.s:par_mem.c timing.h stats.h bench.h +$O/par_mem: par_mem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_mem par_mem.c $O/lmbench.a $(LDLIBS) + +$O/par_ops.s:par_ops.c timing.h stats.h bench.h +$O/par_ops: par_ops.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_ops par_ops.c $O/lmbench.a $(LDLIBS) + +$O/stream.s:stream.c timing.h stats.h bench.h +$O/stream: stream.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/stream stream.c $O/lmbench.a $(LDLIBS) + +$O/lat_sem.s:lat_sem.c timing.h stats.h bench.h +$O/lat_sem: lat_sem.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_sem lat_sem.c $O/lmbench.a $(LDLIBS) + +$O/par_list.s:par_list.c timing.h stats.h bench.h +$O/par_list: par_list.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/par_list par_list.c $O/lmbench.a $(LDLIBS) + +$O/lat_dram_page.s:lat_dram_page.c timing.h stats.h bench.h +$O/lat_dram_page: lat_dram_page.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_dram_page 
lat_dram_page.c $O/lmbench.a $(LDLIBS) + +$O/lat_usleep.s:lat_usleep.c timing.h stats.h bench.h +$O/lat_usleep: lat_usleep.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_usleep lat_usleep.c $O/lmbench.a $(LDLIBS) + +$O/lat_pmake.s:lat_pmake.c timing.h stats.h bench.h +$O/lat_pmake: lat_pmake.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_pmake lat_pmake.c $O/lmbench.a $(LDLIBS) + +$O/lat_rand.s:lat_rand.c timing.h stats.h bench.h +$O/lat_rand: lat_rand.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_rand lat_rand.c $O/lmbench.a $(LDLIBS) + +$O/lat_cmd.s:lat_cmd.c timing.h stats.h bench.h +$O/lat_cmd: lat_cmd.c timing.h stats.h bench.h $O/lmbench.a + $(COMPILE) -o $O/lat_cmd lat_cmd.c $O/lmbench.a $(LDLIBS) + diff --git a/performance/lmbench3/src/TODO b/performance/lmbench3/src/TODO new file mode 100644 index 0000000..47c3ff2 --- /dev/null +++ b/performance/lmbench3/src/TODO @@ -0,0 +1,107 @@ +$Id$ + +Add standard deviation and other statistics calculations to "make stats" +in results. Alternatively, we might report min, 1Q, median, 3Q, max, +as standard deviation for non-normal distributions isn't always sensible. + +Add flags to various file-related benchmarks bw_file_rd, bw_mmap_rd, +lat_fcntl.c, lat_fs, lat_mmap, and lat_pagefault, for parallelism +which selects whether each instance has its own file or shares a +file. + +Figure out how to improve lat_select. It doesn't really work for +multi-processor systems. Linus suggests that we have each process +do some amount of work, and vary the amount of work until context +switch times for the producer degrade. The current architecture +of lat_select is too synchronous and favors simple hand-off +scheduling too much. From Linus. + +Look into threads vs. process scaling. benchmp currently uses +separate processes (via fork()); some benchmarks such as page +faults and VM mapping might have very different performance +for threads vs. 
processes since Linux (at least) has per-memory +space locks for many of these things. From Linus. + +Add a '-f' option to lat_ctx which causes the work to be floating point +summation (so we get floating point state too). (Suggestion by Ingo Molnar) + +Add a threads benchmark suite (context switch, mutex, semaphore, ...). + +Create a new process for each measurement, rather than reusing the same +process. This is mostly to get different page layouts and mostly impacts +the memory latency benchmarks, although it can also affect lat_ctx. + +Write/extend the results processing system/scripts to graph/display/ +process results in the "-P <parallelism>" dimension, and to properly +handle results with differing parallelism when reporting standard +results. The parallelism is stored in the results file as SYNC_MAX. + +Add "bw_udp" benchmark to measure UDP bandwidth +[in progress] + +Make a bw_tcp mode that measures bandwidth for each block and graph that +as offset/bandwidth. + +Make the disk stuff autosize such that you get the same number of data +points regardless of disk size. + +Fix the getsummary to include simple calls. + +Think about the issues of int/long/long long/double/long double +load/stores. Maybe do them all. This will (at least) require +adding a test to scripts/build for the presence of long double +on this system. + +Make all results print out bandwidths in powers of 10/sizes in powers of two. + +Documentation on porting. + +Check that file size is right in the benchmarking system. + +Compiler version info included in results. XXX - do this! + +memory store latency (complex) + Why can't I just use the read one and make it write? + Well, because the read one is list oriented and I need to figure + out reasonable math for the write case. The read one is a load + per statement whereas the write one will be more work, I think. + +RPC numbers reserved for the benchmark. + +Check all the error outputs and make sure they are consistent. 
+ +On all the normalized graphs, make sure that they mean the same thing. +I do not think that the bandwidth measurements are "correct" in this +sense. + +Document the timing.c interfaces. + +Run the whole suite through gcc -Wall and fix all the errors. Also make +sure that it compiles and has the right sizes for 64 bit OS. + +[Mon Jul 1 13:30:01 PDT 1996, after meeting w/ Kevin] + +Do the load latency like so + + loop: + load r1 + { + increase the number of nops until they start to make the + run time longer - the last one was the memory latency. + } + use the register + { + increase the number of nops until they start to make the + run time longer - the last one was the cache fill shadow. + } + repeat + +Do the same thing w/ a varying number of loads (& their uses), showing +the number of outstanding loads implemented to L1, L2, mem. + +Do hand made assembler to get accurate numbers. Provide C source that +mimics the hand made assembler for new machines. + +Think about a report format for the hardware stuff that showed the +numbers as triples L1/L2/mem (or quadruples for alphas). 
+ diff --git a/performance/lmbench3/src/bench.h b/performance/lmbench3/src/bench.h new file mode 100644 index 0000000..8166408 --- /dev/null +++ b/performance/lmbench3/src/bench.h @@ -0,0 +1,323 @@ +/* + * $Id$ + */ +#ifndef _BENCH_H +#define _BENCH_H + +#ifdef WIN32 +#include <windows.h> +typedef unsigned char bool_t; +#endif + +#include <assert.h> +#include <ctype.h> +#include <stdio.h> +#ifndef WIN32 +#include <unistd.h> +#endif +#include <stdlib.h> +#include <fcntl.h> +#include <signal.h> +#include <errno.h> +#ifndef WIN32 +#include <strings.h> +#endif +#include <sys/types.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifndef WIN32 +#include <sys/wait.h> +#include <time.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/resource.h> +#define PORTMAP +#include <rpc/rpc.h> +#endif +#include <rpc/types.h> + +#include <stdarg.h> +#ifndef HAVE_uint +typedef unsigned int uint; +#endif + +#ifndef HAVE_uint64 +#ifdef HAVE_uint64_t +typedef uint64_t uint64; +#else /* HAVE_uint64_t */ +typedef unsigned long long uint64; +#endif /* HAVE_uint64_t */ +#endif /* HAVE_uint64 */ + +#ifndef HAVE_int64 +#ifdef HAVE_int64_t +typedef int64_t int64; +#else /* HAVE_int64_t */ +typedef long long int64; +#endif /* HAVE_int64_t */ +#endif /* HAVE_int64 */ + +#define NO_PORTMAPPER + +#include "stats.h" +#include "timing.h" +#include "lib_debug.h" +#include "lib_tcp.h" +#include "lib_udp.h" +#include "lib_unix.h" + + +#ifdef DEBUG +# define debug(x) fprintf x +#else +# define debug(x) +#endif +#ifdef NO_PORTMAPPER +#define TCP_SELECT -31233 +#define TCP_XACT -31234 +#define TCP_CONTROL -31235 +#define TCP_DATA -31236 +#define TCP_CONNECT -31237 +#define UDP_XACT -31238 +#define UDP_DATA -31239 +#else +#define TCP_SELECT (u_long)404038 /* XXX - unregistered */ +#define TCP_XACT (u_long)404039 /* XXX - unregistered */ +#define TCP_CONTROL (u_long)404040 /* XXX - unregistered */ +#define TCP_DATA (u_long)404041 /* XXX - 
unregistered */ +#define TCP_CONNECT (u_long)404042 /* XXX - unregistered */ +#define UDP_XACT (u_long)404032 /* XXX - unregistered */ +#define UDP_DATA (u_long)404033 /* XXX - unregistered */ +#define VERS (u_long)1 +#endif + +#define UNIX_CONTROL "/tmp/lmbench.ctl" +#define UNIX_DATA "/tmp/lmbench.data" +#define UNIX_LAT "/tmp/lmbench.lat" + +/* + * socket send/recv buffer optimizations + */ +#define SOCKOPT_READ 0x0001 +#define SOCKOPT_WRITE 0x0002 +#define SOCKOPT_RDWR 0x0003 +#define SOCKOPT_PID 0x0004 +#define SOCKOPT_REUSE 0x0008 +#define SOCKOPT_NONE 0 + +#ifndef SOCKBUF +#define SOCKBUF (1024*1024) +#endif + +#ifndef XFERSIZE +#define XFERSIZE (64*1024) /* all bandwidth I/O should use this */ +#endif + +#if defined(SYS5) || defined(WIN32) +#define bzero(b, len) memset(b, 0, len) +#define bcopy(s, d, l) memcpy(d, s, l) +#define rindex(s, c) strrchr(s, c) +#endif +#define gettime usecs_spent +#define streq !strcmp +#define ulong unsigned long + +#ifndef HAVE_DRAND48 +#ifdef HAVE_RAND +#define srand48 srand +#define drand48() ((double)rand() / (double)RAND_MAX) +#elif defined(HAVE_RANDOM) +#define srand48 srandom +#define drand48() ((double)random() / (double)RAND_MAX) +#endif /* HAVE_RAND */ +#endif /* HAVE_DRAND48 */ + +#ifdef WIN32 +#include <process.h> +#define getpid _getpid +int gettimeofday(struct timeval *tv, struct timezone *tz); +#endif + +#define SMALLEST_LINE 32 /* smallest cache line size */ +#define TIME_OPEN2CLOSE + +#define GO_AWAY signal(SIGALRM, exit); alarm(60 * 60); +#define REAL_SHORT 50000 +#define SHORT 1000000 +#define MEDIUM 2000000 +#define LONGER 7500000 /* for networking data transfers */ +#define ENOUGH REAL_SHORT + +#define TRIES 11 + +typedef struct { + uint64 u; + uint64 n; +} value_t; + +typedef struct { + int N; + value_t v[TRIES]; +} result_t; +int sizeof_result(int N); +void insertinit(result_t *r); +void insertsort(uint64, uint64, result_t *); +void save_median(); +void save_minimum(); +void set_results(result_t *r); 
+result_t* get_results(); + + +#define BENCHO(loop_body, overhead_body, enough) { \ + int __i, __N; \ + double __oh; \ + result_t __overhead, __r; \ + insertinit(&__overhead); insertinit(&__r); \ + __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ + if (enough < LONGER) {loop_body;} /* warm the cache */ \ + for (__i = 0; __i < __N; ++__i) { \ + BENCH1(overhead_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__overhead); \ + BENCH1(loop_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__r); \ + } \ + for (__i = 0; __i < __r.N; ++__i) { \ + __oh = __overhead.v[__i].u / (double)__overhead.v[__i].n; \ + if (__r.v[__i].u > (uint64)((double)__r.v[__i].n * __oh)) \ + __r.v[__i].u -= (uint64)((double)__r.v[__i].n * __oh); \ + else \ + __r.v[__i].u = 0; \ + } \ + *(get_results()) = __r; \ +} + +#define BENCH(loop_body, enough) { \ + long __i, __N; \ + result_t __r; \ + insertinit(&__r); \ + __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ + if (enough < LONGER) {loop_body;} /* warm the cache */ \ + for (__i = 0; __i < __N; ++__i) { \ + BENCH1(loop_body, enough); \ + if (gettime() > 0) \ + insertsort(gettime(), get_n(), &__r); \ + } \ + *(get_results()) = __r; \ +} + +#define BENCH1(loop_body, enough) { \ + double __usecs; \ + BENCH_INNER(loop_body, enough); \ + __usecs = gettime(); \ + __usecs -= t_overhead() + get_n() * l_overhead(); \ + settime(__usecs >= 0. ? (uint64)__usecs : 0); \ +} + +#define BENCH_INNER(loop_body, enough) { \ + static iter_t __iterations = 1; \ + int __enough = get_enough(enough); \ + iter_t __n; \ + double __result = 0.; \ + \ + while(__result < 0.95 * __enough) { \ + start(0); \ + for (__n = __iterations; __n > 0; __n--) { \ + loop_body; \ + } \ + __result = stop(0,0); \ + if (__result < 0.99 * __enough \ + || __result > 1.2 * __enough) { \ + if (__result > 150.) 
{ \ + double tmp = __iterations / __result; \ + tmp *= 1.1 * __enough; \ + __iterations = (iter_t)(tmp + 1); \ + } else { \ + if (__iterations > (iter_t)1<<27) { \ + __result = 0.; \ + break; \ + } \ + __iterations <<= 3; \ + } \ + } \ + } /* while */ \ + save_n((uint64)__iterations); settime((uint64)__result); \ +} + +/* getopt stuff */ +#define getopt mygetopt +#define optind myoptind +#define optarg myoptarg +#define opterr myopterr +#define optopt myoptopt +extern int optind; +extern int opterr; +extern int optopt; +extern char *optarg; +int getopt(int ac, char **av, char *opts); + +typedef u_long iter_t; +typedef void (*benchmp_f)(iter_t iterations, void* cookie); + +extern void benchmp(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie + ); + +/* + * These are used by weird benchmarks which cannot return, such as page + * protection fault handling. See lat_sig.c for sample usage. + */ +extern void* benchmp_getstate(); +extern iter_t benchmp_interval(void* _state); + +/* + * Which child process is this? + * Returns a number in the range [0, ..., N-1], where N is the + * total number of children (parallelism) + */ +extern int benchmp_childid(); + +/* + * harvest dead children to prevent zombies + */ +extern void sigchld_wait_handler(int signo); + +/* + * Handle optional pinning/placement of processes on an SMP machine. + */ +extern int handle_scheduler(int childno, int benchproc, int nbenchprocs); + +#include "lib_mem.h" + +/* + * Generated from msg.x which is included here: + + program XACT_PROG { + version XACT_VERS { + char + RPC_XACT(char) = 1; + } = 1; + } = 3970; + + * Please do not edit this file. + * It was generated using rpcgen. 
+ */ + +#define XACT_PROG ((u_long)404040) +#define XACT_VERS ((u_long)1) +#define RPC_XACT ((u_long)1) +#define RPC_EXIT ((u_long)2) +extern char *rpc_xact_1(); +extern char *client_rpc_xact_1(); + +#endif /* _BENCH_H */ diff --git a/performance/lmbench3/src/bk.ver b/performance/lmbench3/src/bk.ver new file mode 100644 index 0000000..00750ed --- /dev/null +++ b/performance/lmbench3/src/bk.ver @@ -0,0 +1 @@ +3 diff --git a/performance/lmbench3/src/busy.c b/performance/lmbench3/src/busy.c new file mode 100644 index 0000000..ab117ba --- /dev/null +++ b/performance/lmbench3/src/busy.c @@ -0,0 +1,10 @@ +volatile int i; + +main() +{ + + nice(10); + for (;;) getppid(); + //for (;;) i++; + exit(i); +} diff --git a/performance/lmbench3/src/bw_file_rd.c b/performance/lmbench3/src/bw_file_rd.c new file mode 100644 index 0000000..61583c6 --- /dev/null +++ b/performance/lmbench3/src/bw_file_rd.c @@ -0,0 +1,192 @@ +/* + * bw_file_rd.c - time reading & summing of a file + * + * Usage: bw_file_rd [-C] [-P <parallelism] [-W <warmup>] [-N <repetitions>] size file + * + * The intent is that the file is in memory. + * Disk benchmarking is done with lmdd. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((int)(x) == -1) { perror(#x); exit(1); } +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +#define TYPE int +#define MINSZ (sizeof(TYPE) * 128) + +void *buf; /* do the I/O here */ +size_t xfersize; /* do it in units of this */ +size_t count; /* bytes to move (can't be modified) */ + +typedef struct _state { + char filename[256]; + int fd; + int clone; +} state_t; + +void doit(int fd) +{ + int sum = 0; + size_t size, chunk; + + size = count; + chunk = xfersize; + while (size >= 0) { + if (size < chunk) chunk = size; + if (read(fd, buf, MIN(size, chunk)) <= 0) { + break; + } + bread(buf, MIN(size, xfersize)); + size -= chunk; + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->fd = -1; + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); + sprintf(s, "%s%d", state->filename, (int)getpid()); + if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { + perror("creating private tempfile"); + unlink(s); + exit(1); + } + strcpy(state->filename, s); + } +} + +void +init_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + int ofd; + + if (iterations) return; + + initialize(0, cookie); + CHK(ofd = open(state->filename, O_RDONLY)); + state->fd = ofd; +} + +void +time_with_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + char *filename = state->filename; + int fd; + + while (iterations-- > 0) { + fd= open(filename, O_RDONLY); + doit(fd); + close(fd); + } +} + +void +time_io_only(iter_t iterations,void * cookie) +{ + state_t *state = (state_t *) cookie; + int fd = state->fd; + + while (iterations-- > 0) { + lseek(fd, 0, 0); + doit(fd); + } +} + +void +cleanup(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->fd >= 0) close(state->fd); + if (state->clone) unlink(state->filename); 
+} + +int +main(int ac, char **av) +{ + int fd; + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char usage[1024]; + + sprintf(usage,"[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|io_only <filename>" + "\nmin size=%d\n",(int) (XFERSIZE>>10)) ; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 3 != ac) { /* should have three arguments left */ + lmbench_usage(ac, av, usage); + } + + strcpy(state.filename,av[optind+2]); + count = bytes(av[optind]); + if (count < MINSZ) { + exit(1); /* I want this to be quiet */ + } + if (count < XFERSIZE) { + xfersize = count; + } else { + xfersize = XFERSIZE; + } + buf = (void *)valloc(XFERSIZE); + bzero(buf, XFERSIZE); + + if (!strcmp("open2close", av[optind+1])) { + benchmp(initialize, time_with_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else if (!strcmp("io_only", av[optind+1])) { + benchmp(init_open, time_io_only, cleanup, + 0, parallel, warmup, repetitions, &state); + } else lmbench_usage(ac, av, usage); + bandwidth(count, get_n() * parallel, 0); + return (0); +} diff --git a/performance/lmbench3/src/bw_mem.c b/performance/lmbench3/src/bw_mem.c new file mode 100644 index 0000000..19583cf --- /dev/null +++ b/performance/lmbench3/src/bw_mem.c @@ -0,0 +1,468 @@ +/* + * bw_mem.c - simple memory write bandwidth benchmark + * + * Usage: bw_mem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size what + * what: rd wr rdwr cp fwr frd fcp bzero bcopy + * + * Copyright (c) 1994-1996 Larry McVoy. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$"; + +#include "bench.h" + +#define TYPE int + +/* + * rd - 4 byte read, 32 byte stride + * wr - 4 byte write, 32 byte stride + * rdwr - 4 byte read followed by 4 byte write to same place, 32 byte stride + * cp - 4 byte read then 4 byte write to different place, 32 byte stride + * fwr - write every 4 byte word + * frd - read every 4 byte word + * fcp - copy every 4 byte word + * + * All tests do 512 byte chunks in a loop. + * + * XXX - do a 64bit version of this. + */ +void rd(iter_t iterations, void *cookie); +void wr(iter_t iterations, void *cookie); +void rdwr(iter_t iterations, void *cookie); +void mcp(iter_t iterations, void *cookie); +void fwr(iter_t iterations, void *cookie); +void frd(iter_t iterations, void *cookie); +void fcp(iter_t iterations, void *cookie); +void loop_bzero(iter_t iterations, void *cookie); +void loop_bcopy(iter_t iterations, void *cookie); +void init_overhead(iter_t iterations, void *cookie); +void init_loop(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +typedef struct _state { + double overhead; + size_t nbytes; + int need_buf2; + int aligned; + TYPE *buf; + TYPE *buf2; + TYPE *buf2_orig; + TYPE *lastone; + size_t N; +} state_t; + +void adjusted_bandwidth(uint64 t, uint64 b, uint64 iter, double ovrhd); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t nbytes; + state_t state; + int c; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> what [conflict]\nwhat: rd wr rdwr cp fwr frd fcp bzero bcopy\n<size> must be larger than 512"; + + state.overhead = 0; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + 
switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* should have two, possibly three [indicates align] arguments left */ + state.aligned = state.need_buf2 = 0; + if (optind + 3 == ac) { + state.aligned = 1; + } else if (optind + 2 != ac) { + lmbench_usage(ac, av, usage); + } + + nbytes = state.nbytes = bytes(av[optind]); + if (state.nbytes < 512) { /* this is the number of bytes in the loop */ + lmbench_usage(ac, av, usage); + } + + if (streq(av[optind+1], "cp") || + streq(av[optind+1], "fcp") || streq(av[optind+1], "bcopy")) { + state.need_buf2 = 1; + } + + if (streq(av[optind+1], "rd")) { + benchmp(init_loop, rd, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "wr")) { + benchmp(init_loop, wr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "rdwr")) { + benchmp(init_loop, rdwr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "cp")) { + benchmp(init_loop, mcp, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "frd")) { + benchmp(init_loop, frd, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "fwr")) { + benchmp(init_loop, fwr, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "fcp")) { + benchmp(init_loop, fcp, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "bzero")) { + benchmp(init_loop, loop_bzero, cleanup, 0, parallel, + warmup, repetitions, &state); + } else if (streq(av[optind+1], "bcopy")) { + benchmp(init_loop, loop_bcopy, cleanup, 0, parallel, + warmup, repetitions, &state); + } else { + lmbench_usage(ac, av, usage); + } + adjusted_bandwidth(gettime(), nbytes, + get_n() * 
parallel, state.overhead); + return(0); +} + +void +init_overhead(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; +} + +void +init_loop(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->buf = (TYPE *)valloc(state->nbytes); + state->buf2_orig = NULL; + state->lastone = (TYPE*)state->buf - 1; + state->lastone = (TYPE*)((char *)state->buf + state->nbytes - 512); + state->N = state->nbytes; + + if (!state->buf) { + perror("malloc"); + exit(1); + } + bzero((void*)state->buf, state->nbytes); + + if (state->need_buf2 == 1) { + state->buf2_orig = state->buf2 = (TYPE *)valloc(state->nbytes + 2048); + if (!state->buf2) { + perror("malloc"); + exit(1); + } + + /* default is to have stuff unaligned wrt each other */ + /* XXX - this is not well tested or thought out */ + if (state->aligned) { + char *tmp = (char *)state->buf2; + + tmp += 2048 - 128; + state->buf2 = (TYPE *)tmp; + } + } +} + +void +cleanup(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + free(state->buf); + if (state->buf2_orig) free(state->buf2_orig); +} + +void +rd(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + register int sum = 0; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { + sum += +#define DOIT(i) p[i]+ + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) + p[124]; + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +wr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + 
while (p <= lastone) { +#define DOIT(i) p[i] = 1; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + } + } +} +#undef DOIT + +void +rdwr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + register int sum = 0; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { +#define DOIT(i) sum += p[i]; p[i] = 1; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +mcp(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + TYPE* p_save = NULL; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + while (p <= lastone) { +#define DOIT(i) dst[i] = p[i]; + DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) + DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) + DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) + DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) + DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); + p += 128; + dst += 128; + } + p_save = p; + } + use_pointer(p_save); +} +#undef DOIT + +void +fwr(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + TYPE* p_save = NULL; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { +#define DOIT(i) p[i]= + 
DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) 1; + p += 128; + } + p_save = p; + } + use_pointer(p_save); +} +#undef DOIT + +void +frd(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register int sum = 0; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + while (p <= lastone) { + sum += +#define DOIT(i) p[i]+ + DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) 
DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) p[127]; + p += 128; + } + } + use_int(sum); +} +#undef DOIT + +void +fcp(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *lastone = state->lastone; + + while (iterations-- > 0) { + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + while (p <= lastone) { +#define DOIT(i) dst[i]=p[i]; + DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) + DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) + DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) + DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) + DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) + DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) + DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) + DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) + DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) + DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) + DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) + DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) + DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) + DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) + DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) + DOIT(91) DOIT(92) DOIT(93) DOIT(94) 
DOIT(95) DOIT(96) + DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) + DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) + DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) + DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) + DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) + DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) + p += 128; + dst += 128; + } + } +} + +void +loop_bzero(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + register size_t N = state->N; + + while (iterations-- > 0) { + bzero(p, N); + } +} + +void +loop_bcopy(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register TYPE *p = state->buf; + register TYPE *dst = state->buf2; + register size_t N = state->N; + + while (iterations-- > 0) { + bcopy(p,dst,N); + } +} + +/* + * Almost like bandwidth() in lib_timing.c, but we need to adjust + * bandwidth based upon loop overhead. + */ +void adjusted_bandwidth(uint64 time, uint64 bytes, uint64 iter, double overhd) +{ +#define MB (1000. * 1000.) + extern FILE *ftiming; + double secs = ((double)time / (double)iter - overhd) / 1000000.0; + double mb; + + mb = bytes / MB; + + if (secs <= 0.) + return; + + if (!ftiming) ftiming = stderr; + if (mb < 1.) { + (void) fprintf(ftiming, "%.6f ", mb); + } else { + (void) fprintf(ftiming, "%.2f ", mb); + } + if (mb / secs < 1.) { + (void) fprintf(ftiming, "%.6f\n", mb/secs); + } else { + (void) fprintf(ftiming, "%.2f\n", mb/secs); + } +} + + diff --git a/performance/lmbench3/src/bw_mmap_rd.c b/performance/lmbench3/src/bw_mmap_rd.c new file mode 100644 index 0000000..03c27b1 --- /dev/null +++ b/performance/lmbench3/src/bw_mmap_rd.c @@ -0,0 +1,185 @@ +/* + * bw_mmap_rd.c - time reading & summing of a file using mmap + * + * Usage: bw_mmap_rd [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file + * + * Sizes less than 2m are not recommended. 
Memory is read by summing it up + * so the numbers include the cost of the adds. If you use sizes large + * enough, you can compare to bw_mem_rd and get the cost of TLB fills + * (very roughly). + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" +#ifdef MAP_FILE +# define MMAP_FLAGS MAP_FILE|MAP_SHARED +#else +# define MMAP_FLAGS MAP_SHARED +#endif + +#define TYPE int +#define MINSZ (sizeof(TYPE) * 128) +#define CHK(x) if ((long)(x) == -1) { perror("x"); exit(1); } + +typedef struct _state { + size_t nbytes; + char filename[256]; + int fd; + int clone; + void *buf; +} state_t; + +void time_no_open(iter_t iterations, void * cookie); +void time_with_open(iter_t iterations, void * cookie); +void initialize(iter_t iterations, void *cookie); +void init_open(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +int +main(int ac, char **av) +{ + int fd; + struct stat sbuf; + void *buf; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t nbytes; + state_t state; + int c; + char *usage = "[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|mmap_only <filename>"; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* should have three arguments left (bytes type filename) */ + if (optind + 3 != ac) { + lmbench_usage(ac, av, usage); + } + + nbytes = 
state.nbytes = bytes(av[optind]); + strcpy(state.filename,av[optind+2]); + CHK(stat(state.filename, &sbuf)); + if ((S_ISREG(sbuf.st_mode) && nbytes > sbuf.st_size) + || (nbytes < MINSZ)) { + fprintf(stderr,"<size> out of range!\n"); + exit(1); + } + + if (!strcmp("open2close", av[optind+1])) { + benchmp(initialize, time_with_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else if (!strcmp("mmap_only", av[optind+1])) { + benchmp(init_open, time_no_open, cleanup, + 0, parallel, warmup, repetitions, &state); + } else { + lmbench_usage(ac, av, usage); + } + bandwidth(nbytes, get_n() * parallel, 0); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->fd = -1; + state->buf = NULL; + + if (state->clone) { + char buf[8192]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); + sprintf(s, "%s%d", state->filename, (int)getpid()); + if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { + perror("creating private tempfile"); + unlink(s); + exit(1); + } + strcpy(state->filename, s); + } +} + +void +init_open(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + initialize(0, cookie); + CHK(state->fd = open(state->filename, 0)); + CHK(state->buf = mmap(0, state->nbytes, PROT_READ, + MMAP_FLAGS, state->fd, 0)); +} + +void +cleanup(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + if (state->buf) munmap(state->buf, state->nbytes); + if (state->fd >= 0) close(state->fd); + if (state->clone) unlink(state->filename); +} + +void +time_no_open(iter_t iterations, void * cookie) +{ + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + bread(state->buf, state->nbytes); + } +} + +void +time_with_open(iter_t iterations, void *cookie) +{ + 
state_t *state = (state_t *) cookie; + char *filename = state->filename; + size_t nbytes = state->nbytes; + int fd; + void *p; + + while (iterations-- > 0) { + CHK(fd = open(filename, 0)); + CHK(p = mmap(0, nbytes, PROT_READ, MMAP_FLAGS, fd, 0)); + bread(p, nbytes); + close(fd); + munmap(p, nbytes); + } +} diff --git a/performance/lmbench3/src/bw_pipe.c b/performance/lmbench3/src/bw_pipe.c new file mode 100644 index 0000000..5d9edfb --- /dev/null +++ b/performance/lmbench3/src/bw_pipe.c @@ -0,0 +1,187 @@ +/* + * bw_pipe.c - pipe bandwidth benchmark. + * + * Usage: bw_pipe [-m <message size>] [-M <total bytes>] \ + * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void reader(iter_t iterations, void* cookie); +void writer(int writefd, char* buf, size_t xfer); + +int XFER = 10*1024*1024; + +struct _state { + int pid; + size_t xfer; /* bytes to read/write per "packet" */ + size_t bytes; /* bytes to read/write in one iteration */ + char *buf; /* buffer memory space */ + int readfd; + int initerr; +}; + +void +initialize(iter_t iterations, void *cookie) +{ + int pipes[2]; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->initerr = 0; + if (pipe(pipes) == -1) { + perror("pipe"); + state->initerr = 1; + return; + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + close(pipes[0]); + handle_scheduler(benchmp_childid(), 1, 1); + state->buf = valloc(state->xfer); + if (state->buf == NULL) { + perror("child: no memory"); + state->initerr = 4; + return; + } + touch(state->buf, state->xfer); + writer(pipes[1], state->buf, state->xfer); + return; + /*NOTREACHED*/ + + case -1: + perror("fork"); + state->initerr = 3; + return; + /*NOTREACHED*/ + + default: + break; + } + close(pipes[1]); + state->readfd = pipes[0]; + state->buf = valloc(state->xfer + getpagesize()); + if (state->buf == NULL) { + perror("parent: no memory"); + state->initerr = 4; + return; + } + touch(state->buf, state->xfer + getpagesize()); + state->buf += 128; /* destroy page alignment */ +} + +void +cleanup(iter_t iterations, void * cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + close(state->readfd); + if (state->pid > 0) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +void +reader(iter_t iterations, void * cookie) +{ + size_t done; + ssize_t n; + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + for (done = 0; done < state->bytes; done += n) { + if ((n = read(state->readfd, state->buf, state->xfer)) < 0) { + perror("bw_pipe: 
reader: error in read"); + exit(1); + } + } + } +} + +void +writer(int writefd, char* buf, size_t xfer) +{ + size_t done; + ssize_t n; + + for ( ;; ) { +#ifdef TOUCH + touch(buf, xfer); +#endif + for (done = 0; done < xfer; done += n) { + if ((n = write(writefd, buf, xfer - done)) < 0) { + exit(0); + } + } + } +} + +int +main(int ac, char *av[]) +{ + struct _state state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.xfer = XFERSIZE; /* per-packet size */ + state.bytes = XFER; /* total bytes per call */ + + while (( c = getopt(ac, av, "m:M:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.xfer = bytes(optarg); + break; + case 'M': + state.bytes = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + /* round up total byte count to a multiple of xfer */ + if (state.bytes < state.xfer) { + state.bytes = state.xfer; + } else if (state.bytes % state.xfer) { + state.bytes += state.bytes - state.bytes % state.xfer; + } + benchmp(initialize, reader, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + fprintf(stderr, "Pipe bandwidth: "); + mb(get_n() * parallel * state.bytes); + } + return(0); +} diff --git a/performance/lmbench3/src/bw_tcp.c b/performance/lmbench3/src/bw_tcp.c new file mode 100644 index 0000000..6a2e8f7 --- /dev/null +++ b/performance/lmbench3/src/bw_tcp.c @@ -0,0 +1,251 @@ +/* + * bw_tcp.c - simple TCP bandwidth test + * + * Three programs in one - + * server usage: bw_tcp -s + * client usage: bw_tcp [-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] 
hostname + * shutdown: bw_tcp -hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +typedef struct _state { + int sock; + uint64 move; + int msize; + char *server; + int fd; + char *buf; +} state_t; + +void server_main(); +void client_main(int parallel, state_t *state); +void source(int data); + +void initialize(iter_t iterations, void* cookie); +void loop_transfer(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = LONGER; + int repetitions = TRIES; + int shutdown = 0; + state_t state; + char *usage = "-s\n OR [-m <message size>] [-M <bytes to move>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S serverhost\n"; + int c; + + state.msize = 0; + state.move = 0; + + /* Rest is client argument processing */ + while (( c = getopt(ac, av, "sS:m:M:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + break; + case 'S': /* shutdown serverhost */ + { + int conn; + conn = tcp_connect(optarg, TCP_DATA, SOCKOPT_NONE); + write(conn, "0", 1); + exit(0); + } + case 'm': + state.msize = bytes(optarg); + break; + case 'M': + state.move = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind < ac - 2 || optind >= ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + + if (state.msize 
== 0 && state.move == 0) { + state.msize = state.move = XFERSIZE; + } else if (state.msize == 0) { + state.msize = state.move; + } else if (state.move == 0) { + state.move = state.msize; + } + + /* make the number of bytes to move a multiple of the message size */ + if (state.move % state.msize) { + state.move += state.msize - state.move % state.msize; + } + + /* + * Default is to warmup the connection for seven seconds, + * then measure performance over each timing interval. + * This minimizes the effect of opening and initializing TCP + * connections. + */ + benchmp(initialize, loop_transfer, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + fprintf(stderr, "%.6f ", state.msize / (1000. * 1000.)); + mb(state.move * get_n() * parallel); + } +} + +void +initialize(iter_t iterations, void *cookie) +{ + int c; + char buf[100]; + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->buf = valloc(state->msize); + if (!state->buf) { + perror("valloc"); + exit(1); + } + touch(state->buf, state->msize); + + state->sock = tcp_connect(state->server, TCP_DATA, SOCKOPT_READ|SOCKOPT_WRITE|SOCKOPT_REUSE); + if (state->sock < 0) { + perror("socket connection"); + exit(1); + } + sprintf(buf, "%lu", state->msize); + if (write(state->sock, buf, strlen(buf) + 1) != strlen(buf) + 1) { + perror("control write"); + exit(1); + } +} + +void +loop_transfer(iter_t iterations, void *cookie) +{ + int c; + uint64 todo; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + for (todo = state->move; todo > 0; todo -= c) { + if ((c = read(state->sock, state->buf, state->msize)) <= 0) { + exit(1); + } + if (c > todo) c = todo; + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + /* close connection */ + (void)close(state->sock); +} + +void +server_main() +{ + int data, newdata; + + GO_AWAY; + + data = tcp_server(TCP_DATA, 
SOCKOPT_WRITE|SOCKOPT_REUSE); + if (data < 0) { + perror("server socket creation"); + exit(1); + } + + signal(SIGCHLD, sigchld_wait_handler); + for ( ;; ) { + newdata = tcp_accept(data, SOCKOPT_WRITE); + switch (fork()) { + case -1: + perror("fork"); + break; + case 0: + source(newdata); + exit(0); + default: + close(newdata); + break; + } + } +} + +/* + * Read the message size. Keep transferring + * data in message-size sized packets until + * the socket goes away. + */ +void +source(int data) +{ + size_t count, m; + unsigned long nbytes; + char *buf, scratch[100]; + + /* + * read the message size + */ + bzero(scratch, 100); + if (read(data, scratch, 100) <= 0) { + perror("control nbytes"); + exit(7); + } + sscanf(scratch, "%lu", &nbytes); + m = nbytes; + + /* + * A hack to allow turning off the absorb daemon. + */ + if (m == 0) { + tcp_done(TCP_DATA); + kill(getppid(), SIGTERM); + exit(0); + } + + buf = valloc(m); + bzero(buf, m); + + /* + * Keep sending messages until the connection is closed + */ + while (write(data, buf, m) == m) { +#ifdef TOUCH + touch(buf, m); +#endif + } + free(buf); +} diff --git a/performance/lmbench3/src/bw_udp.c b/performance/lmbench3/src/bw_udp.c new file mode 100644 index 0000000..8479114 --- /dev/null +++ b/performance/lmbench3/src/bw_udp.c @@ -0,0 +1,203 @@ +/* + * bw_udp.c - simple UDP bandwidth test + * + * Three programs in one - + * server usage: bw_tcp -s + * client usage: bw_tcp [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname [bytes] + * shutdown: bw_tcp -S hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +#define MAX_MSIZE (10 * 1024 * 1024) + +typedef struct _state { + int sock; + int seq; + long move; + long msize; + char *server; + int fd; + char *buf; +} state_t; + +void server_main(); +void client_main(int parallel, state_t *state); +void init(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +void loop_transfer(iter_t iterations, void *cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + state_t state; + char *usage = "-s\n OR [-m <message size>] [-W <warmup>] [-N <repetitions>] server [size]\n OR -S serverhost\n"; + int c; + uint64 usecs; + + state.msize = 0; + state.move = 10*1024*1024; + + /* Rest is client argument processing */ + while (( c = getopt(ac, av, "sS:m:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int seq, n; + int sock = udp_connect(optarg, + UDP_XACT, + SOCKOPT_NONE); + for (n = -1; n > -5; --n) { + seq = htonl(n); + (void) send(sock, &seq, sizeof(int), 0); + } + close(sock); + exit (0); + } + case 'm': + state.msize = atoi(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind < ac - 2 || optind >= ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + if (optind < ac) { + state.move = bytes(av[optind]); + } + if (state.msize == 0) { + state.msize = state.move; + } + /* make the number of bytes to move a multiple of the message size */ + if (state.move % state.msize) { + state.move += state.move - state.move % state.msize; + } + + state.buf = valloc(state.msize); + if (!state.buf) { + perror("valloc"); + exit(1); + } + touch(state.buf, state.msize); + + /* + * Make one run take at least 5 seconds. 
+ * This minimizes the effect of connect & reopening TCP windows. + */ + benchmp(init, loop_transfer, cleanup, LONGER, parallel, warmup, repetitions, &state ); + +out: (void)fprintf(stderr, "socket UDP bandwidth using %s: ", state.server); + mb(state.move * get_n() * parallel); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); + state->seq = 0; + state->buf = (char*)malloc(state->msize); +} + +void +loop_transfer(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char *server = state->server; + int sock = state->sock; + long control[2], nbytes; + + nbytes = state->move; + control[0] = state->move; + control[1] = state->msize; + + while (iterations-- > 0) { + if (send(sock, control, 2 * sizeof(long), 0) != 2 * sizeof(long)) { + perror("bw_udp client: send failed"); + exit(5); + } + while (nbytes > 0) { + if (recv(sock, state->buf, state->msize, 0) != state->msize) { + perror("bw_udp client: recv failed"); + exit(5); + } + nbytes -= state->msize; + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +server_main() +{ + char *buf = (char*)valloc(MAX_MSIZE); + int sock, namelen, seq = 0; + long nbytes, msize; + struct sockaddr_in it; + + GO_AWAY; + + sock = udp_server(UDP_XACT, SOCKOPT_NONE); + + while (1) { + namelen = sizeof(it); + if (recvfrom(sock, (void*)buf, 2 * sizeof(long), 0, + (struct sockaddr*)&it, &namelen) < 0) { + fprintf(stderr, "bw_udp server: recvfrom: got wrong size\n"); + exit(9); + } + nbytes = ntohl(*(long*)buf); + msize = ntohl(*((long*)buf + 1)); + while (nbytes > 0) { + if (sendto(sock, (void*)buf, msize, 0, + (struct sockaddr*)&it, sizeof(it)) < 0) { + perror("bw_udp sendto"); + exit(9); + } + nbytes -= msize; + } + } +} + diff --git 
a/performance/lmbench3/src/bw_unix.c b/performance/lmbench3/src/bw_unix.c new file mode 100644 index 0000000..aad2078 --- /dev/null +++ b/performance/lmbench3/src/bw_unix.c @@ -0,0 +1,190 @@ +/* + * bw_unix.c - simple Unix stream socket bandwidth test + * + * Usage: bw_unix [-m <message size>] [-M <total bytes>] \ + * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +void reader(iter_t iterations, void * cookie); +void writer(int controlfd, int writefd, char* buf, void* cookie); + +size_t XFER = 10*1024*1024; + +struct _state { + int pid; + size_t xfer; /* bytes to read/write per "packet" */ + size_t bytes; /* bytes to read/write in one iteration */ + char *buf; /* buffer memory space */ + int pipes[2]; + int control[2]; + int initerr; +}; + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->buf = valloc(XFERSIZE); + touch(state->buf, XFERSIZE); + state->initerr = 0; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, state->pipes) == -1) { + perror("socketpair"); + state->initerr = 1; + return; + } + if (pipe(state->control) == -1) { + perror("pipe"); + state->initerr = 2; + return; + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + close(state->control[1]); + close(state->pipes[0]); + writer(state->control[0], state->pipes[1], state->buf, state); + return; + /*NOTREACHED*/ + + case -1: + perror("fork"); + state->initerr = 3; + return; + /*NOTREACHED*/ + + default: + break; + } + 
close(state->control[0]); + close(state->pipes[1]); +} +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + close(state->control[1]); + close(state->pipes[0]); + if (state->pid > 0) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +void +reader(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + size_t done, n; + size_t todo = state->bytes; + + while (iterations-- > 0) { + write(state->control[1], &todo, sizeof(todo)); + for (done = 0; done < todo; done += n) { + if ((n = read(state->pipes[0], state->buf, state->xfer)) <= 0) { + /* error! */ + exit(1); + } + } + } +} + +void +writer(int controlfd, int writefd, char* buf, void* cookie) +{ + size_t todo, n, done; + struct _state* state = (struct _state*)cookie; + + for ( ;; ) { + read(controlfd, &todo, sizeof(todo)); + for (done = 0; done < todo; done += n) { +#ifdef TOUCH + touch(buf, XFERSIZE); +#endif + if ((n = write(writefd, buf, state->xfer)) < 0) { + /* error! 
*/ + exit(1); + } + } + } +} + +int +main(int argc, char *argv[]) +{ + struct _state state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.xfer = XFERSIZE; /* per-packet size */ + state.bytes = XFER; /* total bytes per call */ + + while (( c = getopt(argc,argv,"m:M:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.xfer = bytes(optarg); + break; + case 'M': + state.bytes = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(argc, argv, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(argc, argv); + break; + } + } + if (optind == argc - 1) { + state.bytes = bytes(argv[optind]); + } else if (optind < argc - 1) { + lmbench_usage(argc, argv); + } + + state.pid = 0; + + /* round up total byte count to a multiple of xfer */ + if (state.bytes % state.xfer) { + state.bytes += state.bytes - state.bytes % state.xfer; + } + + benchmp(initialize, reader, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + fprintf(stderr, "AF_UNIX sock stream bandwidth: "); + mb(get_n() * parallel * XFER); + } + return(0); +} + + + diff --git a/performance/lmbench3/src/cache.c b/performance/lmbench3/src/cache.c new file mode 100644 index 0000000..7bc1651 --- /dev/null +++ b/performance/lmbench3/src/cache.c @@ -0,0 +1,750 @@ +/* + * cache.c - guess the cache size(s) + * + * usage: cache [-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + + +struct cache_results { + int len; + int maxlen; + int line; + int mline; + double latency; + double variation; + double ratio; + double slope; +}; + +int find_cache(int start, int n, struct cache_results* p); +int collect_data(int start, int line, int maxlen, + int repetitions, struct cache_results** pdata); +void search(int left, int right, int repetitions, + struct mem_state* state, struct cache_results* p); +int collect_sample(int repetitions, struct mem_state* state, + struct cache_results* p); +double measure(int size, int repetitions, + double* variation, struct mem_state* state); +double remove_chunk(int i, int chunk, int npages, size_t* pages, + int len, int repetitions, struct mem_state* state); +int test_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state); +int fixup_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state); +void check_memory(int size, struct mem_state* state); +void pagesort(int n, size_t* pages, double* latencies); + +#ifdef ABS +#undef ABS +#endif +#define ABS(a) ((a) < 0 ? -(a) : (a)) + +#define SWAP(a,b) {int _tmp = (a); (a) = (b); (b) = _tmp;} + +#define THRESHOLD 1.5 + +#define FIVE(m) m m m m m +#define TEN(m) FIVE(m) FIVE(m) +#define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) +#define HUNDRED(m) FIFTY(m) FIFTY(m) +#define DEREF p = (char**)*p; + +static char **addr_save = NULL; + +void +mem_benchmark(iter_t iterations, void *cookie) +{ + register char **p; + struct mem_state* state = (struct mem_state*)cookie; + + p = addr_save ? 
addr_save : (char**)state->p[0]; + while (iterations-- > 0) { + HUNDRED(DEREF); + } + addr_save = p; +} + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int c; + int i, j, n, start, level, prev, min; + int line = -1; + int warmup = 0; + int repetitions = TRIES; + int print_cost = 0; + int maxlen = 32 * 1024 * 1024; + int *levels; + double par, maxpar; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + struct cache_results* r; + struct mem_state state; + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + line = atoi(optarg); + if (line < sizeof(char*)) + line = sizeof(char*); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + state.width = 1; + state.len = maxlen; + state.maxlen = maxlen; + state.pagesize = getpagesize(); + + if (line <= 0) { + line = line_find(maxlen, warmup, repetitions, &state); + if (line <= 0) + line = getpagesize() / 16; + state.line = line; + } + + n = collect_data(512, line, maxlen, repetitions, &r); + r[n-1].line = line; + levels = (int*)malloc(n * sizeof(int)); + bzero(levels, n * sizeof(int)); + + for (start = 0, prev = 0, level = 0; + (i = find_cache(start, n, r)) >= 0; + ++level, start = i + 1, prev = i) + { + /* + * performance is not greatly improved over main memory, + * so it is likely not a cache boundary + */ + if (r[i].latency / r[n-1].latency > 0.5) break; + + /* + * is cache boundary "legal"? (e.g. 
2^N or 1.5*2^N) + * cache sizes are "never" 1.25*2^N or 1.75*2^N + */ + for (c = r[i].len; c > 0x7; c >>= 1) + ; + if (c == 5 || c == 7) { + i++; + if (i >= n) break; + } + + levels[level] = i; + } + + for (i = 0; i < level; ++i) { + prev = (i > 0 ? levels[i-1]: -1); + + /* locate most likely cache latency */ + for (j = min = prev + 1; j < levels[i]; ++j) { + if (r[j].latency <= 0.) continue; + if (r[min].latency <= 0. + || ABS(r[j].slope) < ABS(r[min].slope)) { + min = j; + } + } + + /* Compute line size */ + if (i == level - 1) { + line = r[n-1].line; + } else { + j = (levels[i] + levels[i+1]) / 2; + for (line = -1; line <= 0 && j < n; ++j) { + r[j].line = line_find(r[j].len, warmup, + repetitions, &state); + line = r[j].line; + } + } + + /* Compute memory parallelism for cache */ + maxpar = par_mem(r[levels[i]-1].len, warmup, + repetitions, &state); + + fprintf(stderr, + "L%d cache: %d bytes %.2f nanoseconds %d linesize %.2f parallelism\n", + i+1, r[levels[i]].len, r[min].latency, line, maxpar); + } + + /* Compute memory parallelism for main memory */ + j = n - 1; + for (i = n - 1; i >= 0; i--) { + if (r[i].latency < 0.) continue; + if (r[i].latency > 0.99 * r[n-1].latency) + j = i; + } + par = par_mem(r[j].len, warmup, repetitions, &state); + + fprintf(stderr, "Memory latency: %.2f nanoseconds %.2f parallelism\n", + r[n-1].latency, par); + + exit(0); +} + +int +find_cache(int start, int n, struct cache_results* p) +{ + int i, j, prev; + double max = -1.; + + for (prev = (start == 0 ? start : start - 1); prev > 0; prev--) { + if (p[prev].ratio > 0.0) break; + } + + for (i = start, j = -1; i < n; ++i) { + if (p[i].latency < 0.) 
			continue;
		if (p[prev].ratio <= p[i].ratio && p[i].ratio > max) {
			j = i;
			max = p[i].ratio;
		} else if (p[i].ratio < max && THRESHOLD < max) {
			/* ratio fell off a significant peak: that peak was the boundary */
			return j;
		}
		prev = i;
	}
	return -1;
}

/*
 * collect_data - build the array of latency samples.
 *
 * Sample sizes run from `start` to `maxlen` in a quasi-geometric
 * progression (four steps per doubling).  Only the first and last
 * samples are measured up front; search() fills in the interior
 * adaptively.  Returns the number of samples; *pdata points at the
 * malloc'd array (caller keeps it for the life of the program).
 */
int
collect_data(int start, int line, int maxlen,
	int repetitions, struct cache_results** pdata)
{
	int	i;
	int	samples;
	int	idx;
	int	len = start;
	int	incr = start / 4;
	double	latency;	/* NOTE(review): unused */
	double	variation;	/* NOTE(review): unused */
	struct mem_state state;
	struct cache_results* p;


	state.width = 1;
	state.len = maxlen;
	state.maxlen = maxlen;
	state.line = line;
	state.pagesize = getpagesize();
	state.addr = NULL;

	/* count the (maximum) number of samples to take */
	for (len = start, incr = start / 4, samples = 0; len <= maxlen; incr<<=1) {
		for (i = 0; i < 4 && len <= maxlen; ++i, len += incr)
			samples++;
	}
	*pdata = (struct cache_results*)
		malloc(samples * sizeof(struct cache_results));

	p = *pdata;

	/* initialize the data */
	for (len = start, incr = start / 4, idx = 0; len <= maxlen; incr<<=1) {
		for (i = 0; i < 4 && len <= maxlen; ++i, ++idx, len += incr) {
			p[idx].len = len;
			p[idx].line = -1;
			p[idx].mline = -1;
			p[idx].latency = -1.;
			p[idx].ratio = -1.;
			p[idx].slope = -1.;
		}
	}

	/* make sure we have enough memory for the scratch data */
	while (state.addr == NULL) {
		mem_initialize(0, &state);
		if (state.addr == NULL) {
			/* halve the target until the allocation succeeds */
			maxlen /= 2;
			state.len = state.maxlen = maxlen;
			while (p[samples-1].len > maxlen)
				samples--;
		}
	}
	for (i = 0; i < samples; ++i)
		p[i].maxlen = maxlen;
	/* in case the system has laid out the pages well, don't scramble */
	for (i = 0; i < state.npages; ++i)
		state.pages[i] = i * state.pagesize;

	p[0].latency = measure(p[0].len, repetitions, &p[0].variation, &state);
	p[samples-1].latency = measure(p[samples-1].len, repetitions,
				       &p[samples-1].variation, &state);
	/*
	 * NOTE(review): this loop re-measures the current last sample and
	 * then unconditionally decrements `samples`; the interplay between
	 * the re-measured index and the shrinking count looks suspicious
	 * (a successfully re-measured sample is still dropped) -- confirm
	 * the intended trailing-sample trimming behavior.
	 */
	while (p[samples-1].latency <= 0.0) {
		p[samples-1].latency = measure(p[samples-1].len,
					       repetitions,
					       &p[samples-1].variation,
					       &state);
		--samples;
	}
	/* adaptively fill in the interior samples between the endpoints */
	search(0, samples - 1, repetitions, &state, p);

	/*
	fprintf(stderr, "%10.10s %8.8s %8.8s %8.8s %8.8s %5.5s %5.5s\n",
		"mem size", "latency", "variation", "ratio", "slope",
		"line", "mline");
	for (idx = 0; idx < samples; ++idx) {
		if (p[idx].latency < 0.) continue;
		fprintf(stderr,
			"%10.6f %8.3f %8.3f %8.3f %8.3f %4d %4d\n",
			p[idx].len / (1000. * 1000.),
			p[idx].latency,
			p[idx].variation,
			p[idx].ratio,
			p[idx].slope,
			p[idx].line,
			p[idx].mline);
	}
	/**/
	mem_cleanup(0, &state);

	return samples;
}

/*
 * search - recursive bisection over [left, right].  Computes the
 * left/right latency ratio; when it indicates a significant change
 * (ratio > 1.35 or < 0.97), measures the midpoint and recurses into
 * both halves.  Obvious bad data points (ratio < 0.98) are overwritten
 * with the right endpoint's latency.
 */
void
search(int left, int right, int repetitions,
	struct mem_state* state, struct cache_results* p)
{
	int	middle = left + (right - left) / 2;

	if (p[left].latency > 0.0) {
		p[left].ratio = p[right].latency / p[left].latency;
		p[left].slope = (p[left].ratio - 1.) / (double)(right - left);
		/* we probably have a bad data point, so ignore it */
		if (p[left].ratio < 0.98) {
			p[left].latency = p[right].latency;
			p[left].ratio = 1.;
			p[left].slope = 0.;
		}
	}

	if (middle == left || middle == right)
		return;

	if (p[left].ratio > 1.35 || p[left].ratio < 0.97) {
		collect_sample(repetitions, state, &p[middle]);
		search(middle, right, repetitions, state, p);
		search(left, middle, repetitions, state, p);
	}
	return;
}

/*
 * collect_sample - measure one working-set size; for multi-page sets,
 * iterate test_chunk() (up to 8 times) to swap out pages that collide
 * in the cache so the measurement reflects the best-case layout.
 * Returns non-zero if a positive latency was obtained.
 */
int
collect_sample(int repetitions, struct mem_state* state,
		struct cache_results* p)
{
	int	i, modified, npages;
	double	baseline;

	npages = (p->len + getpagesize() - 1) / getpagesize();
	baseline = measure(p->len, repetitions, &p->variation, state);

	if (npages > 1) {
		for (i = 0, modified = 1; i < 8 && modified; ++i) {
			modified = test_chunk(0, npages, npages,
					      state->pages, p->len,
					      &baseline, 0.0,
					      repetitions, state);
		}
	}
	p->latency = baseline;

	return (p->latency > 0);
}

/*
 * measure - time the average load latency (in ns) of a pointer chain
 * covering `size` bytes, taking the median over `repetitions` runs.
 * *variation is set to median/minimum as a stability indicator.
 */
double
measure(int size, int repetitions,
	double* variation, struct mem_state* state)
{
	int	i, j, npages, nlines;
	double	time, median;
	char	*p;
	result_t *r, *r_save;
	size_t	*pages;

	pages = state->pages;
	npages = (size + getpagesize() - 1) / getpagesize();
	nlines = state->nlines;

	/*
	 * NOTE(review): if size is not page-aligned and the remainder is
	 * smaller than one line, nlines becomes 0 and lines[nlines - 1]
	 * below indexes lines[-1] -- confirm callers never pass such sizes.
	 */
	if (size % getpagesize())
		nlines = (size % getpagesize()) / state->line;

	r_save = get_results();
	r = (result_t*)malloc(sizeof_result(repetitions));
	insertinit(r);

	/*
	 * assumes that you have used mem_initialize() to setup the memory
	 */
	/*
	 * Link the pages into one chain: the last line of each page points
	 * at the first line of the next page, word by word; the final page
	 * wraps back to page 0 with the word index rotated by one.
	 */
	p = state->base;
	for (i = 0; i < npages - 1; ++i) {
		for (j = 0; j < state->nwords; ++j) {
			*(char**)(p + pages[i] + state->lines[state->nlines - 1] + state->words[j]) =
				p + pages[i+1] + state->lines[0] + state->words[j];
		}
	}
	for (j = 0; j < state->nwords; ++j) {
		*(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) =
			p + pages[0] + state->lines[0] + state->words[(j+1)%state->nwords];
	}

	/*
	check_memory(size, state);
	/**/

	addr_save = NULL;
	state->p[0] = p + pages[0] + state->lines[0] + state->words[0];
	/* now, run through the chain once to clear the cache */
	mem_benchmark((size / sizeof(char*) + 100) / 100, state);

	for (i = 0; i < repetitions; ++i) {
		BENCH1(mem_benchmark(__n, state); __n = 1;, 0)
		insertsort(gettime(), get_n(), r);
	}
	set_results(r);
	/* 100 dereferences per benchmark iteration; scale to ns per load */
	median = (1000. * (double)gettime()) / (100. * (double)get_n());

	save_minimum();
	time = (1000. * (double)gettime()) / (100. * (double)get_n());

	/* Are the results stable, or do they vary? */
	if (time != 0.)
		*variation = median / time;
	else
		*variation = -1.0;
	set_results(r_save);
	free(r);

	/* undo the short-page wrap so the full chain is intact again */
	if (nlines < state->nlines) {
		for (j = 0; j < state->nwords; ++j) {
			*(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) =
				p + pages[npages - 1] + state->lines[nlines] + state->words[j];
		}
	}
	/*
	fprintf(stderr, "%.6f %.2f\n", state->len / (1000. * 1000.), median);
	/**/

	return median;
}


/*
 * remove_chunk - measure latency with `chunk` pages (starting at index
 * i) temporarily swapped to the end of the page list and excluded from
 * the measured length.  The page list is restored before returning.
 */
double
remove_chunk(int i, int chunk, int npages, size_t* pages,
	int len, int repetitions, struct mem_state* state)
{
	int	n, j;
	double	t, var;

	if (i + chunk < npages) {
		for (j = 0; j < chunk; ++j) {
			n = pages[i+j];
			pages[i+j] = pages[npages-1-j];
			pages[npages-1-j] = n;
		}
	}
	t = measure(len - chunk * getpagesize(), repetitions, &var, state);
	if (i + chunk < npages) {
		for (j = 0; j < chunk; ++j) {
			n = pages[i+j];
			pages[i+j] = pages[npages-1-j];
			pages[npages-1-j] = n;
		}
	}

	return t;
}

/*
 * test_chunk - recursively locate page subsets whose removal lowers
 * latency (i.e. pages that collide in the cache) and hand them to
 * fixup_chunk() for replacement.  Returns non-zero if the page list
 * was modified.
 */
int
test_chunk(int i, int chunk, int npages, size_t* pages, int len,
	   double *baseline, double chunk_baseline,
	   int repetitions, struct mem_state* state)
{
	int	j, k, subchunk;
	int	modified = 0;
	int	changed;
	double	t, tt, nodiff_chunk_baseline;

	/* small enough to fix page-by-page */
	if (chunk <= 20 && chunk < npages) {
		return fixup_chunk(i, chunk, npages, pages, len, baseline,
				   chunk_baseline, repetitions, state);
	}

	nodiff_chunk_baseline = *baseline;
	subchunk = (chunk + 19) / 20;
	for (j = i, k = 0; j < i + chunk; j+=subchunk, k++) {
		if (j + subchunk > i + chunk) subchunk = i + chunk - j;

		t = remove_chunk(j, subchunk, npages, pages,
				 len, repetitions, state);

		/*
		fprintf(stderr, "test_chunk(...): baseline=%G, t=%G, len=%d, chunk=%d, i=%d\n", *baseline, t, len, subchunk, j);
		/**/

		/* removal didn't help noticeably: this subchunk is fine */
		if (t >= 0.99 * *baseline) continue;
		if (t >= 0.999 * nodiff_chunk_baseline) continue;

		/* re-measure to guard against a noisy first reading */
		tt = remove_chunk(j, subchunk, npages, pages,
				  len, repetitions, state);

		if (tt > t) t = tt;

		if (t >= 0.99 * *baseline) continue;
		if (t >= 0.999 * nodiff_chunk_baseline) continue;

		changed = test_chunk(j, subchunk, npages, pages, len,
				     baseline, t, repetitions, state);

		if (changed) {
			modified = 1;
		} else {
			nodiff_chunk_baseline = t;
		}
	}
	return modified;
}

/*
 * This routine is called once we have identified a chunk
 * that has pages that are suspected of colliding with other
 *
pages. + * + * The algorithm is to remove all the pages, and then + * slowly add back pages; attempting to add pages with + * minimal cost. + */ +int +fixup_chunk(int i, int chunk, int npages, size_t* pages, int len, + double *baseline, double chunk_baseline, + int repetitions, struct mem_state* state) +{ + int j, k, l, m; + int page, substitute, original; + int ntotalpages, nsparepages; + int subset_len; + int swapped = 0; + size_t *pageset; + size_t *saved_pages; + static int available_index = 0; + double t, tt, low, var, new_baseline; + double latencies[20]; + + ntotalpages = state->maxlen / getpagesize(); + nsparepages = ntotalpages - npages; + pageset = state->pages + npages; + new_baseline = *baseline; + + saved_pages = (size_t*)malloc(sizeof(size_t) * ntotalpages); + bcopy(pages, saved_pages, sizeof(int) * ntotalpages); + + /* move everything to the end of the page list */ + if (i + chunk < npages) { + for (j = 0; j < chunk; ++j) { + page = pages[i+j]; + pages[i+j] = pages[npages-chunk+j]; + pages[npages-chunk+j] = page; + } + } + + if (available_index >= nsparepages) available_index = 0; + + /* + * first try to identify which pages we can definitely keep + */ + for (j = 0, k = chunk; j < k; ) { + + t = measure((npages - chunk + j + 1) * getpagesize(), + repetitions, &var, state); + + if (0.995 * t <= chunk_baseline) { + latencies[j] = t; + ++j; /* keep this page */ + } else { + --k; /* this page is probably no good */ + latencies[k] = t; + SWAP(pages[npages - chunk + j], pages[npages - chunk + k]); + } + } + /* + * sort the "bad" pages by increasing latency + */ + pagesort(chunk - j, &pages[npages - chunk + j], &latencies[j]); + + /* + fprintf(stderr, "fixup_chunk: len=%d, chunk=%d, j=%d, baseline=%G, lat[%d]=%G..%G\n", len, chunk, j, *baseline, j, (j < chunk ? 
latencies[j] : -1.0), latencies[chunk - 1]); + /**/ + + if (chunk >= npages && j < chunk / 2) { + j = chunk / 2; + t = measure((npages - chunk + j + 1) * getpagesize(), + repetitions, &var, state); + chunk_baseline = t; + } + + for (k = 0; j < chunk && k < 2 * npages; ++k) { + original = npages - chunk + j; + substitute = nsparepages - 1; + substitute -= (k + available_index) % (nsparepages - 1); + subset_len = (original + 1) * getpagesize(); + if (j == chunk - 1 && len % getpagesize()) { + subset_len = len; + } + + SWAP(pages[original], pageset[substitute]); + t = measure(subset_len, repetitions, &var, state); + SWAP(pages[original], pageset[substitute]); + + /* + * try to keep pages ordered by increasing latency + */ + if (t < latencies[chunk - 1]) { + latencies[chunk - 1] = t; + SWAP(pages[npages - 1], pageset[substitute]); + pagesort(chunk - j, + &pages[npages - chunk + j], &latencies[j]); + } + if (0.995 * latencies[j] <= chunk_baseline) { + ++j; /* keep this page */ + ++swapped; + } + } + + available_index = (k + available_index) % (nsparepages - 1); + + /* measure new baseline, in case we didn't manage to optimally + * replace every page + */ + if (swapped) { + new_baseline = measure(len, repetitions, &var, state); + + /* + fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d, baseline=%G, newbase=%G\n", len, swapped, k, *baseline, new_baseline); + /**/ + + if (new_baseline >= 0.999 * *baseline) { + /* no benefit to these changes; back them out */ + swapped = 0; + bcopy(saved_pages, pages, sizeof(int) * ntotalpages); + } else { + /* we sped up, so keep these changes */ + *baseline = new_baseline; + + /* move back to the middle of the pagelist */ + if (i + chunk < npages) { + for (j = 0; j < chunk; ++j) { + page = pages[i+j]; + pages[i+j] = pages[npages-chunk+j]; + pages[npages-chunk+j] = page; + } + } + } + /* + } else { + fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d\n", len, swapped, k); + /**/ + } + free(saved_pages); + + return swapped; +} + 
+void +check_memory(int size, struct mem_state* state) +{ + int i, j, first_page, npages, nwords; + int page, word_count, pagesize; + off_t offset; + char **p, **q; + char **start; + + pagesize = getpagesize(); + npages = (size + pagesize - 1) / pagesize; + nwords = size / sizeof(char*); + + /* + fprintf(stderr, "check_memory(%d, ...): entering, %d words\n", size, nwords); + /**/ + word_count = 1; + first_page = 0; + start = (char**)(state->base + state->pages[0] + state->lines[0] + state->words[0]); + for (q = p = (char**)*start; p != start; ) { + word_count++; + offset = (unsigned long)p - (unsigned long)state->base; + page = offset - offset % pagesize; + for (j = first_page; j < npages; ++j) { + if (page == state->pages[j]) break; + } + if (j == npages) { + for (j = 0; j < first_page; ++j) { + if (page == state->pages[j]) break; + } + if (j == first_page) { + fprintf(stderr, + "check_memory: bad memory reference for size %d\n", + size); + } + } + first_page = j % npages; + p = (char**)*p; + if (word_count & 0x1) q = (char**)*q; + if (*p == *q) { + fprintf(stderr, "check_memory: unwanted memory cycle! 
page=%d\n", j); + return; + } + } + if (word_count != nwords) { + fprintf(stderr, "check_memory: wrong word count, expected %d, got %d\n", nwords, word_count); + } + /* + fprintf(stderr, "check_memory(%d, ...): exiting\n", size); + /**/ +} + +void +pagesort(int n, size_t* pages, double* latencies) +{ + int i, j; + double t; + + for (i = 0; i < n - 1; ++i) { + for (j = i + 1; j < n; ++j) { + if (latencies[i] > latencies[j]) { + t = latencies[i]; + latencies[i] = latencies[j]; + latencies[j] = t; + SWAP(pages[i], pages[j]); + } + } + } +} diff --git a/performance/lmbench3/src/clock.c b/performance/lmbench3/src/clock.c new file mode 100644 index 0000000..48ff8a0 --- /dev/null +++ b/performance/lmbench3/src/clock.c @@ -0,0 +1,24 @@ +/* + * clock.c + * + * calculate the minimum timing loop length that gives us significant results + */ +#include "bench.h" + +char *id = "$Id$"; +char *revision = "$Revision$"; + +main() +{ + uint64 enough; + double t_overhead, l_overhead; + + enough = compute_enough(15); + printf("ENOUGH=%lu\n", (unsigned long)enough); fflush(stdout); + t_overhead = timing_overhead(enough); + printf("TIMING_OVERHEAD=%f\n", t_overhead); fflush(stdout); + l_overhead = loop_overhead(enough, t_overhead); + printf("LOOP_OVERHEAD=%f\n", l_overhead); + printf("# version [%s]\n", revision); + exit(0); +} diff --git a/performance/lmbench3/src/disk.c b/performance/lmbench3/src/disk.c new file mode 100644 index 0000000..c3f1154 --- /dev/null +++ b/performance/lmbench3/src/disk.c @@ -0,0 +1,310 @@ +/* + * disk - calculate zone bandwidths and seek times + * + * Usage: disk device + * + * Copyright (c) 1994-1997 Larry McVoy. All rights reserved. + * Bits of this are derived from work by Ethan Solomita. 
+ */ + +#include <stdio.h> +#include <sys/types.h> +#include <unistd.h> +#include <stdlib.h> +#include "bench.h" +#include "flushdisk.c" + +#ifndef sgi +#define NO_LSEEK64 +#define off64_t long long +#endif +#define SEEKPOINTS 2000 +#define ZONEPOINTS 150 + +uint64 disksize(char *); +int seekto(int, uint64); +int zone(char *disk, int oflag, int bsize); +int seek(char *disk, int oflag); + +int +main(int ac, char **av) +{ + fprintf(stderr, "\"Seek times for %s\n", av[1]); + seek(av[1], 0); + fprintf(stderr, "\n"); + fprintf(stderr, "\"Zone bandwidth for %s\n", av[1]); + zone(av[1], 0, (1<<20)); + return (0); +} + +int +zone(char *disk, int oflag, int bsize) +{ + char *buf; + int usecs; + int error; + int n; + int fd; + uint64 off; + int stride; + + if ((fd = open(disk, oflag)) == -1) { + perror(disk); + exit(1); + } + buf = valloc(bsize); + if (!buf) { + perror("valloc"); + exit(1); + } + bzero(buf, bsize); +#ifdef linux + flushdisk(fd); +#endif + + /* + * We want ZONEPOINTS data points + * but the stride has to be at least 512 and a 512 multiple. + * Weird code below for precision. + */ + off = disksize(disk); + off /= ZONEPOINTS; + stride = off; + if (stride < 512) stride = 512; + stride += 511; + stride >>= 9; + stride <<= 9; + + /* + * Very small disks such as ZIP drives get a 256K blocksize. + * As measured on my SCSI ZIP, there seems to be no + * difference between 256K and 1MB for sequential reads. + * XXX - there is a rotational delay difference but that's tough. + */ + if (bsize > stride) bsize = 256<<10; + if (bsize > stride) stride = bsize; + + off *= ZONEPOINTS; + debug((stdout, "stride=%d bs=%d size=%dM points=%d\n", + stride, bsize, (int)(off >> 20), (int)(off/stride))); + + /* + * Read buf's worth of data every stride and time it. + * Don't include the rotational delay. + * This first I/O outside the loop is to catch read/write permissions. + */ + +#define IO(a,b,c) (oflag == 0 ? 
(n = read(a,b,c)) : (n = write(a,b,c))) + + error = IO(fd, buf, 512); + if (error == -1) { + perror(disk); + exit(1); + } + off = 512; + for ( ;; ) { + if (IO(fd, buf, 1024) != 1024) { + exit(0); + } + off += 1024; + start(0); + if (IO(fd, buf, bsize) != bsize) { + exit(0); + } + usecs = stop(0, 0); + off += bsize; + fprintf(stderr, "%.01f %.2f\n", + off/1000000.0, (double)bsize/usecs); + off += stride; + if (seekto(fd, off)) { + exit(0); + } + } + exit(0); +} + +/* + * Seek - calculate seeks as a function of distance. + */ +#undef IO +#define IO(a,b,c) error = (oflag == 0 ? read(a,b,c) : write(a,b,c)); \ + if (error == -1) { perror("io"); exit(1); } +#define IOSIZE 512 +#define TOOSMALL 1000 /* seeks this small are cached */ +#define TOOBIG 1000000 /* seeks this big are remapped or weirdos */ + /* zip drives have seeks this long */ + +int +seek(char *disk, int oflag) +{ + char *buf; + int fd; + off64_t size; + off64_t begin, end; + int usecs; + int error; + int tot_msec = 0, tot_io = 0; + int stride; + + if ((fd = open(disk, oflag)) == -1) { + perror(disk); + return (-1); + } +#ifdef linux + flushdisk(fd); +#endif + size = disksize(disk); + buf = valloc(IOSIZE); + bzero(buf, IOSIZE); + + /* + * We flip back and forth, in strides of 1MB (typically). + * If we have a 100MB fd, that means we do + * 1, 99, 2, 98, etc. + * + * We want around SEEK POINTS data points + * but the stride has to be at least 512 and a 512 multiple. 
+ */ + stride = size / SEEKPOINTS; + if (stride < 512) stride = 512; + stride += 511; + stride >>= 9; + stride <<= 9; + + debug((stdout, "stride=%d size=%dM points=%d\n", + stride, (int)(size >> 20), (int)(size/stride))); + + end = size; + begin = 0; + seekto(fd, begin); + IO(fd, buf, IOSIZE); + while (end >= begin + stride*2) { + end -= stride; + start(0); + seekto(fd, end); + IO(fd, buf, IOSIZE); + usecs = stop(0, 0); + if (usecs > TOOSMALL && usecs < TOOBIG) { + tot_io++; tot_msec += usecs/1000; + fprintf(stderr, "%.01f %.02f\n", + (end - begin - stride) / 1000000., usecs/1000.); + } + + begin += stride; + start(0); + seekto(fd, begin); + IO(fd, buf, IOSIZE); + usecs = stop(0, 0); + if (usecs > TOOSMALL && usecs < TOOBIG) { + tot_io++; tot_msec += usecs/1000; + fprintf(stderr, "%.01f %.02f\n", + (end + stride - begin) / 1000000., usecs/1000.); + } + } + /* + * This is wrong, it should take the 1/3 stroke seek average. + avg_msec = (double)tot_msec/tot_io; + fprintf(stderr, "Average time == %.04f\n", avg_msec); + */ + return (0); +} + +/* + * Calculate how big a device is. + * + * To avoid 32 bit problems, our units are MB. + */ +#define FORWARD (512<<20) +#define FORWARD1 (64<<20) +#define FORWARD2 (1<<20) + +/* + * Go forward in 1GB chunks until you can't. + * Go backwards in 128MB chunks until you can. + * Go forwards in 1MB chunks until you can't and return that -1. + */ +uint64 +disksize(char *disk) +{ + int fd = open(disk, 0); + char buf[512]; + uint64 off = 0; + + if (fd == -1) { + perror("usage: disksize device"); + return(0); + } + /* + * Go forward until it doesn't work. 
+ */ + for ( ;; ) { + off += FORWARD; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD; + break; + } + } + + for ( ;; ) { + off += FORWARD1; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD1; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD1; + break; + } + } + + for ( ;; ) { + off += FORWARD2; + if (seekto(fd, off)) { + debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); + off -= FORWARD2; + break; + } + if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { + debug((stdout, "read @ %dM failed\n", (int)(off>>20))); + off -= FORWARD2; + break; + } + } + + debug((stdout, "disksize(%s) = %d MB\n", disk, (int)(off >> 20))); + return (off); +} + +#define BIGSEEK (1<<30) + +int +seekto(int fd, uint64 off) +{ +#ifdef __linux__ + extern loff_t llseek(int, loff_t, int); + + if (llseek(fd, (loff_t)off, SEEK_SET) == (loff_t)-1) { + return(-1); + } + return (0); +#else + uint64 here = 0; + + lseek(fd, 0, 0); + while ((uint64)(off - here) > (uint64)BIGSEEK) { + if (lseek(fd, BIGSEEK, SEEK_CUR) == -1) break; + here += BIGSEEK; + } + assert((uint64)(off - here) <= (uint64)BIGSEEK); + if (lseek(fd, (int)(off - here), SEEK_CUR) == -1) return (-1); + return (0); +#endif +} diff --git a/performance/lmbench3/src/enough.c b/performance/lmbench3/src/enough.c new file mode 100644 index 0000000..6128ccf --- /dev/null +++ b/performance/lmbench3/src/enough.c @@ -0,0 +1,13 @@ +#include <stdio.h> +#include <stdlib.h> + +extern int get_enough(int); + +int +main() +{ + putenv("LOOP_O=0.0"); + putenv("TIMING_O=0.0"); + printf("%u\n", get_enough(0)); + return (0); +} diff --git a/performance/lmbench3/src/flushdisk.c b/performance/lmbench3/src/flushdisk.c new file 
mode 100644 index 0000000..0c422ed --- /dev/null +++ b/performance/lmbench3/src/flushdisk.c @@ -0,0 +1,42 @@ +#ifdef linux +/* + * flushdisk() - linux block cache clearing + */ + +#include <stdio.h> +#include <sys/types.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/mount.h> + +int +flushdisk(int fd) +{ + int ret = ioctl(fd, BLKFLSBUF, 0); + usleep(100000); + return (ret); +} + +#endif + +#ifdef MAIN +int +main(int ac, char **av) +{ +#ifdef linux + int fd; + int i; + + for (i = 1; i < ac; ++i) { + fd = open(av[i], 0); + if (flushdisk(fd)) { + exit(1); + } + close(fd); + } +#endif + exit(0); +} +#endif diff --git a/performance/lmbench3/src/getopt.c b/performance/lmbench3/src/getopt.c new file mode 100644 index 0000000..a868959 --- /dev/null +++ b/performance/lmbench3/src/getopt.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 1997 L.W.McVoy + * + * SGI's fucking getopt doesn't follow GNU's reset policy. Isn't having + * N versions of Unix a great thing for the world? I'm gonna move to NT + * if these assholes don't get their act together. + * + * This version handles + * + * - (leaves it and returns) + * -a + * -abcd + * -r <arg> + * -r<arg> + * -abcr <arg> + * -abcr<arg> + * -r<arg> -R<arg>, etc. + * + * A special form is "d|" instead of "d:". This means the arg has to be + * right next to the option. + * Another special form is "d;". This means the option must be right next + * to the option letter and can not be blank. + */ +#include "bench.h" +static char *id = "%@%"; + +int optopt; /* option that is in error, if we return an error */ +int optind; /* next arg in argv we process */ +char *optarg; /* argument to an option */ +static int n; + +int +getopt(int ac, char **av, char *opts) +{ + char *t; + + if (!optind) { + optind = 1; + n = 1; + } + debug((stderr, "GETOPT ind=%d n=%d arg=%s av[%d]='%s'\n", + optind, n, optarg ? 
optarg : "", optind, av[optind])); + + if ((optind >= ac) || (av[optind][0] != '-') || !av[optind][1]) { + return (EOF); + } + + assert(av[optind][n]); + for (t = (char *)opts; *t; t++) { + if (*t == av[optind][n]) { + break; + } + } + if (!*t) { + optopt = av[optind][n]; + debug((stderr, "\tran out of option letters\n")); + return ('?'); + } + + /* OK, we found a legit option, let's see what to do with it. + * If it isn't one that takes an option, just advance and return. + */ + if (t[1] != ':' && t[1] != '|' && t[1] != ';') { + if (!av[optind][n+1]) { + optind++; + n = 1; + } else { + n++; + } + debug((stderr, "\tLegit singleton %c\n", *t)); + return (*t); + } + + /* got one with an option, see if it is cozied up to the flag */ + if (av[optind][n+1]) { + if (av[optind][n+1]) { + optarg = &av[optind][n+1]; + } else { + optarg = 0; + } + optind++; + n = 1; + debug((stderr, "\t%c with %s\n", *t, optarg)); + return (*t); + } + + /* If it was not there, and it is optional, OK */ + if (t[1] == '|') { + optarg = 0; + optind++; + n = 1; + debug((stderr, "\t%c without arg\n", *t)); + return (*t); + } + + /* was it supposed to be there? */ + if (t[1] == ';') { + optarg = 0; + optind++; + optopt = *t; + debug((stderr, "\twanted another word\n")); + return ('?'); + } + + /* Nope, there had better be another word. 
*/ + if ((optind + 1 == ac) || (av[optind+1][0] == '-')) { + optopt = av[optind][n]; + debug((stderr, "\twanted another word\n")); + return ('?'); + } + optarg = av[optind+1]; + optind += 2; + n = 1; + debug((stderr, "\t%c with arg %s\n", *t, optarg)); + return (*t); +} + +#ifdef TEST + +/* XXX a.out -y file */ +main(int ac, char **av) +{ + extern char *optarg; + extern int optind; + char *comment = 0; + int c; + + while ((c = getopt(ac, av, "fnpsx:y|")) != -1) { + switch (c) { + case 'f': + case 'n': + case 'p': + case 's': + printf("Got option %c\n", c); + break; + case 'x': + case 'y': + comment = optarg; + printf("Got optarg %s with -%c\n", comment, c); + break; + case '?': + fprintf(stderr, "bad option %c\n", optopt); + break; + default: + fprintf(stderr, "unknown ret %c\n", c); + break; + } + } + while (av[optind]) { + printf("av[%d] = %s\n", optind, av[optind++]); + } + exit(0); +} +#endif diff --git a/performance/lmbench3/src/hello.c b/performance/lmbench3/src/hello.c new file mode 100644 index 0000000..15a2493 --- /dev/null +++ b/performance/lmbench3/src/hello.c @@ -0,0 +1,8 @@ +#include "bench.h" + +int +main() +{ + write(1, "Hello world\n", 12); + return (0); +} diff --git a/performance/lmbench3/src/lat_cmd.c b/performance/lmbench3/src/lat_cmd.c new file mode 100644 index 0000000..412a4d2 --- /dev/null +++ b/performance/lmbench3/src/lat_cmd.c @@ -0,0 +1,100 @@ +/* + * lat_cmd.c - time to complete a given command line + * + * usage: lat_cmd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmd... + * + * Copyright (c) 2004 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void bench(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); + +typedef struct _state { + char** argv; + pid_t pid; +} state_t; + +int +main(int ac, char **av) +{ + int c; + int i; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char buf[1024]; + state_t state; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmdline...\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind >= ac) { + lmbench_usage(ac, av, usage); + } + state.argv = (char**)malloc((ac - optind + 1) * sizeof(char*)); + state.pid = 0; + for (i = 0; i < ac - optind; ++i) { + state.argv[i] = av[optind + i]; + } + state.argv[i] = NULL; + + benchmp(NULL, bench, NULL, 0, parallel, warmup, repetitions, &state); + micro("lat_cmd", get_n()); + return (0); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t* state = (state_t*)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +bench(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + + signal(SIGCHLD, SIG_DFL); + while (iterations-- > 0) { + switch (state->pid = fork()) { + case 0: + execvp(state->argv[0], state->argv); + /*NOTREACHED*/ + default: + break; + } + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + diff --git a/performance/lmbench3/src/lat_connect.c b/performance/lmbench3/src/lat_connect.c new file mode 100644 index 0000000..6639cca --- /dev/null +++ b/performance/lmbench3/src/lat_connect.c @@ -0,0 +1,110 @@ +/* + * lat_connect.c - simple TCP connection latency test + * + * Three programs in 
one - + * server usage: lat_connect -s + * client usage: lat_connect [-N <repetitions>] hostname + * shutdown: lat_connect -hostname + * + * lat_connect may not be parallelized because of idiosyncracies + * with TCP connection creation. Basically, if the client tries + * to create too many connections too quickly, the system fills + * up the set of available connections with TIME_WAIT connections. + * We can only measure the TCP connection cost accurately if we + * do just a few connections. Since the parallel harness needs + * each child to run for a second, this guarantees that the + * parallel version will generate inaccurate results. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +typedef struct _state { + char *server; +} state_t; + +void doclient(iter_t iterations, void * cookie); +void server_main(); + +int +main(int ac, char **av) +{ + state_t state; + int repetitions = TRIES; + int c; + char buf[256]; + char *usage = "-s\n OR [-S] [-N <repetitions>] server\n"; + + while (( c = getopt(ac, av, "sSP:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int sock = tcp_connect(av[optind], + TCP_CONNECT, + SOCKOPT_NONE); + write(sock, "0", 1); + close(sock); + exit(0); + } + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + benchmp(NULL, doclient, NULL, 0, 1, 0, repetitions, &state); + + sprintf(buf, "TCP/IP connection cost to %s", state.server); + micro(buf, get_n()); + exit(0); +} + 
+void +doclient(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register char *server = state->server; + register int sock; + + while (iterations-- > 0) { + sock = tcp_connect(server, TCP_CONNECT, SOCKOPT_REUSE); + close(sock); + } +} + +void +server_main() +{ + int newsock, sock; + char c ='1'; + + GO_AWAY; + sock = tcp_server(TCP_CONNECT, SOCKOPT_NONE|SOCKOPT_REUSE); + for (;;) { + newsock = tcp_accept(sock, SOCKOPT_NONE); + if (read(newsock, &c, 1) > 0) { + tcp_done(TCP_CONNECT); + exit(0); + } + close(newsock); + } + /* NOTREACHED */ +} diff --git a/performance/lmbench3/src/lat_ctx.c b/performance/lmbench3/src/lat_ctx.c new file mode 100644 index 0000000..4c81af8 --- /dev/null +++ b/performance/lmbench3/src/lat_ctx.c @@ -0,0 +1,350 @@ +/* + * lat_ctx.c - context switch timer + * + * usage: lat_ctx [-P parallelism] [-W <warmup>] [-N <repetitions>] [-s size] #procs [#procs....] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + + +#define MAXPROC 2048 +#define CHUNK (4<<10) +#define TRIPS 5 +#ifndef max +#define max(a, b) ((a) > (b) ? 
(a) : (b)) +#endif + +void doit(int rd, int wr, int process_size); +int create_pipes(int **p, int procs); +int create_daemons(int **p, pid_t *pids, int procs, int process_size); +void initialize_overhead(iter_t iterations, void* cookie); +void cleanup_overhead(iter_t iterations, void* cookie); +void benchmark_overhead(iter_t iterations, void* cookie); +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); + +struct _state { + int process_size; + double overhead; + int procs; + pid_t* pids; + int **p; + void* data; +}; + +int +main(int ac, char **av) +{ + int i, maxprocs; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-s kbytes] processes [processes ...]\n"; + double time; + + /* + * Need 4 byte ints. + */ + if (sizeof(int) != 4) { + fprintf(stderr, "Fix sumit() in ctx.c.\n"); + exit(1); + } + + state.process_size = 0; + state.overhead = 0.0; + state.pids = NULL; + + /* + * If they specified a context size, or parallelism level, get them. 
+ */ + while (( c = getopt(ac, av, "s:P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 's': + state.process_size = atoi(optarg) * 1024; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind > ac - 1) + lmbench_usage(ac, av, usage); + + /* compute pipe + sumit overhead */ + maxprocs = atoi(av[optind]); + for (i = optind; i < ac; ++i) { + state.procs = atoi(av[i]); + if (state.procs > maxprocs) + maxprocs = state.procs; + } + state.procs = maxprocs; + benchmp(initialize_overhead, benchmark_overhead, cleanup_overhead, + 0, 1, warmup, repetitions, &state); + if (gettime() == 0) return(0); + state.overhead = gettime(); + state.overhead /= get_n(); + fprintf(stderr, "\n\"size=%dk ovr=%.2f\n", + state.process_size/1024, state.overhead); + + /* compute the context switch cost for N processes */ + for (i = optind; i < ac; ++i) { + state.procs = atoi(av[i]); + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + + time = gettime(); + time /= get_n(); + time /= state.procs; + time -= state.overhead; + + if (time > 0.0) + fprintf(stderr, "%d %.2f\n", state.procs, time); + } + + return (0); +} + +void +initialize_overhead(iter_t iterations, void* cookie) +{ + int i; + int procs; + int* p; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + pState->pids = NULL; + pState->p = (int**)malloc(pState->procs * (sizeof(int*) + 2 * sizeof(int))); + p = (int*)&pState->p[pState->procs]; + for (i = 0; i < pState->procs; ++i) { + pState->p[i] = p; + p += 2; + } + + pState->data = (pState->process_size > 0) ? 
malloc(pState->process_size) : NULL; + if (pState->data) + bzero(pState->data, pState->process_size); + + procs = create_pipes(pState->p, pState->procs); + if (procs < pState->procs) { + cleanup_overhead(0, cookie); + exit(1); + } +} + +void +cleanup_overhead(iter_t iterations, void* cookie) +{ + int i; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + for (i = 0; i < pState->procs; ++i) { + close(pState->p[i][0]); + close(pState->p[i][1]); + } + + free(pState->p); + if (pState->data) free(pState->data); +} + +void +benchmark_overhead(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + int i = 0; + int msg = 1; + + while (iterations-- > 0) { + if (write(pState->p[i][1], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (read(pState->p[i][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (++i == pState->procs) { + i = 0; + } + bread(pState->data, pState->process_size); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + int procs; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + initialize_overhead(iterations, cookie); + + pState->pids = (pid_t*)malloc(pState->procs * sizeof(pid_t)); + if (pState->pids == NULL) + exit(1); + bzero((void*)pState->pids, pState->procs * sizeof(pid_t)); + procs = create_daemons(pState->p, pState->pids, + pState->procs, pState->process_size); + if (procs < pState->procs) { + cleanup(0, cookie); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + int i; + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + /* + * Close the pipes and kill the children. 
+ */ + cleanup_overhead(iterations, cookie); + for (i = 1; pState->pids && i < pState->procs; ++i) { + if (pState->pids[i] > 0) { + kill(pState->pids[i], SIGKILL); + waitpid(pState->pids[i], NULL, 0); + } + } + if (pState->pids) + free(pState->pids); + pState->pids = NULL; +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + int msg; + + /* + * Main process - all others should be ready to roll, time the + * loop. + */ + while (iterations-- > 0) { + if (write(pState->p[0][1], &msg, sizeof(msg)) != + sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + if (read(pState->p[pState->procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + exit(1); + } + bread(pState->data, pState->process_size); + } +} + + +void +doit(int rd, int wr, int process_size) +{ + int msg; + void* data = NULL; + + if (process_size) { + data = malloc(process_size); + if (data) bzero(data, process_size); + } + for ( ;; ) { + if (read(rd, &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + break; + } + bread(data, process_size); + if (write(wr, &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("read/write on pipe"); */ + break; + } + } + exit(1); +} + + +int +create_daemons(int **p, pid_t *pids, int procs, int process_size) +{ + int i, j; + int msg; + + /* + * Use the pipes as a ring, and fork off a bunch of processes + * to pass the byte through their part of the ring. + * + * Do the sum in each process and get that time before moving on. + */ + handle_scheduler(benchmp_childid(), 0, procs-1); + for (i = 1; i < procs; ++i) { + switch (pids[i] = fork()) { + case -1: /* could not fork, out of processes? 
*/ + return i; + + case 0: /* child */ + handle_scheduler(benchmp_childid(), i, procs-1); + for (j = 0; j < procs; ++j) { + if (j != i - 1) close(p[j][0]); + if (j != i) close(p[j][1]); + } + doit(p[i-1][0], p[i][1], process_size); + /* NOTREACHED */ + + default: /* parent */ + ; + } + } + + /* + * Go once around the loop to make sure that everyone is ready and + * to get the token in the pipeline. + */ + if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg) || + read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { + /* perror("write/read/write on pipe"); */ + exit(1); + } + return procs; +} + +int +create_pipes(int **p, int procs) +{ + int i; + /* + * Get a bunch of pipes. + */ + morefds(); + for (i = 0; i < procs; ++i) { + if (pipe(p[i]) == -1) { + return i; + } + } + return procs; +} diff --git a/performance/lmbench3/src/lat_dram_page.c b/performance/lmbench3/src/lat_dram_page.c new file mode 100644 index 0000000..250af78 --- /dev/null +++ b/performance/lmbench3/src/lat_dram_page.c @@ -0,0 +1,201 @@ +/* + * lat_dram_page.c - guess the DRAM page latency + * + * usage: lat_dram_page + * + * Copyright (c) 2002 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void dram_page_initialize(iter_t iterations, void* cookie); +void benchmark_loads(iter_t iterations, void *cookie); +double loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie); + +struct dram_page_state +{ + struct mem_state mstate; + int group; +}; + +int +main(int ac, char **av) +{ + int i, j, l; + int verbose = 0; + int maxlen = 64 * 1024 * 1024; + int warmup = 0; + int repetitions = TRIES; + int c; + struct dram_page_state state; + double dram_hit, dram_miss; + char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; + + state.mstate.width = 1; + state.mstate.line = sizeof(char*); + state.mstate.pagesize = getpagesize(); + state.group = 16; + + while (( c = getopt(ac, av, "avL:T:M:W:N:")) != EOF) { + switch(c) { + case 'v': + verbose = 1; + break; + case 'L': + state.mstate.line = bytes(optarg); + break; + case 'T': + state.group = bytes(optarg); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + dram_hit = loads(mem_initialize, maxlen, warmup, repetitions, &state); + dram_miss = loads(dram_page_initialize, maxlen, warmup, repetitions, &state); + + if (dram_hit < 0.95 * dram_miss) { + fprintf(stderr, "%f\n", dram_miss - dram_hit); + } else { + fprintf(stderr, "0.0\n"); + } + + return (0); +} + +#define ONE p = (char **)*p; +#define FIVE ONE ONE ONE ONE ONE +#define TEN FIVE FIVE +#define FIFTY TEN TEN TEN TEN TEN +#define HUNDRED FIFTY FIFTY + +void +benchmark_loads(iter_t iterations, void *cookie) +{ + struct mem_state* state = (struct mem_state*)cookie; + register char **p = (char**)state->base; + register int i; + register int count = state->len / (state->line * 100) + 1; + + while (iterations-- > 0) { + for (i = 0; i < count; ++i) { + HUNDRED; + } + } + + use_pointer((void *)p); +} + +void 
+regroup(size_t* pages, int groupsize, void* cookie) +{ + register int i, j; + register char* ptr; + register char *page; + register char *page_end; + register char *p = 0 /* lint */; + struct mem_state* state = (struct mem_state*)cookie; + + if (groupsize <= 1) return; + + p = state->base; + + /* + * for all but the last page in the group, + * point to the same line in the next page + */ + for (i = 0; i < groupsize - 1; ++i) { + for (j = 0; j < state->pagesize; j += sizeof(char*)) { + *(char**)(p + pages[i] + j) = p + pages[i+1] + j; + } + } + + /* + * for the last page, point to the next line + * in the first page of the group, except for + * the last line in the page which points to + * the first line in the next group + * + * since the pointers are all set up for the + * last line, only modify the pointers for + * the other lines + */ + page = p + pages[groupsize-1]; + page_end = page + state->pagesize; + for (i = 0; i < state->pagesize; i += sizeof(char*)) { + ptr = *(char**)(page + i); + if (page <= ptr && ptr < page_end) { + int offset = (int)(ptr - page); + *(char**)(page + i) = p + pages[0] + offset; + } + } +} + +/* + * This is like mem_initialize + */ +void +dram_page_initialize(iter_t iterations, void* cookie) +{ + int i; + struct mem_state* state = (struct mem_state*)cookie; + struct dram_page_state* dstate = (struct dram_page_state*)cookie; + + if (iterations) return; + + mem_initialize(iterations, cookie); + + for (i = 0; i < state->npages; i += dstate->group) { + int groupsize = dstate->group; + if (groupsize > state->npages - i) { + groupsize = state->npages - i; + } + regroup(state->pages + i, groupsize, cookie); + } + + benchmark_loads(1, cookie); +} + +double +loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie) +{ + double result; + int count; + int parallel = 1; + struct mem_state* state = (struct mem_state*)cookie; + + state->len = len; + state->maxlen = len; + count = 100 * (state->len / (state->line * 100) + 1); + 
+ /* + * Now walk them and time it. + */ + benchmp(initialize, benchmark_loads, mem_cleanup, + 0, parallel, warmup, repetitions, cookie); + + /* We want to get to nanoseconds / load. */ + result = (1000. * (double)gettime()) / (double)(count * get_n()); + /* + fprintf(stderr, "%.5f %.3f\n", len / (1024. * 1024.), result); + /**/ + + return result; +} diff --git a/performance/lmbench3/src/lat_fcntl.c b/performance/lmbench3/src/lat_fcntl.c new file mode 100644 index 0000000..bfe9e7f --- /dev/null +++ b/performance/lmbench3/src/lat_fcntl.c @@ -0,0 +1,224 @@ +#include "bench.h" + +/* + * lat_fcntl.c - file locking test + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id: lat_pipe.c,v 1.8 1997/06/16 05:38:58 lm Exp $\n"; + +#include "bench.h" + +struct flock lock, unlock; +struct flock s1, s2; + +/* + * Create two files, use them as a ping pong test. 
+ * Process A: + * lock(1) + * unlock(2) + * Process B: + * unlock(1) + * lock(2) + * Initial state: + * lock is locked + * lock2 is locked + */ + +#define waiton(fd) fcntl(fd, F_SETLKW, &lock) +#define release(fd) fcntl(fd, F_SETLK, &unlock) + +struct _state { + char filename1[2048]; + char filename2[2048]; + int pid; + int fd1; + int fd2; +}; + +void initialize(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +void +procA(struct _state *state) +{ + if (waiton(state->fd1) == -1) { + perror("lock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd2) == -1) { + perror("unlock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd1) == -1) { + perror("unlock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } +} + +void +procB(struct _state *state) +{ + if (release(state->fd1) == -1) { + perror("unlock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (release(state->fd2) == -1) { + perror("unlock of fd2 failed\n"); + cleanup(0, state); + exit(1); + } + if (waiton(state->fd1) == -1) { + perror("lock of fd1 failed\n"); + cleanup(0, state); + exit(1); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + char buf[10000]; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + sprintf(state->filename1, "/tmp/lmbench-fcntl%d.1", getpid()); + sprintf(state->filename2, "/tmp/lmbench-fcntl%d.2", getpid()); + state->pid = 0; + state->fd1 = -1; + state->fd2 = -1; + + unlink(state->filename1); + unlink(state->filename2); + if ((state->fd1 = open(state->filename1, O_CREAT|O_RDWR, 0666)) == -1) { + perror("create"); + exit(1); + } + if ((state->fd2 = open(state->filename2, O_CREAT|O_RDWR, 
0666)) == -1) { + perror("create"); + exit(1); + } + unlink(state->filename1); + unlink(state->filename2); + write(state->fd1, buf, sizeof(buf)); + write(state->fd2, buf, sizeof(buf)); + lock.l_type = F_WRLCK; + lock.l_whence = 0; + lock.l_start = 0; + lock.l_len = 1; + unlock = lock; + unlock.l_type = F_UNLCK; + if (waiton(state->fd1) == -1) { + perror("lock1"); + exit(1); + } + if (waiton(state->fd2) == -1) { + perror("lock2"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case -1: + perror("fork"); + exit(1); + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + for ( ;; ) { + procB(state); + } + exit(0); + default: + break; + } +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + procA(state); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + int i; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + if (state->fd1 >= 0) close(state->fd1); + if (state->fd2 >= 0) close(state->fd2); + state->fd1 = -1; + state->fd2 = -1; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + } + state->pid = 0; +} + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + /* + * If they specified a parallelism level, get it. 
+ */ + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + state.pid = 0; + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + micro("Fcntl lock latency", 2 * get_n()); + + return (0); +} diff --git a/performance/lmbench3/src/lat_fifo.c b/performance/lmbench3/src/lat_fifo.c new file mode 100644 index 0000000..e3f69c4 --- /dev/null +++ b/performance/lmbench3/src/lat_fifo.c @@ -0,0 +1,165 @@ +/* + * lat_fifo.c - named pipe transaction test + * + * usage: lat_fifo [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define F1 "/tmp/lmbench_f1.%d" +#define F2 "/tmp/lmbench_f2.%d" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int wr, int rd); + +typedef struct _state { + char filename1[256]; + char filename2[256]; + int pid; + int wr; + int rd; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Fifo latency", get_n()); + return (0); +} + +void +initialize(iter_t iterations, void *cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + state->pid = 0; + sprintf(state->filename1,F1,getpid()); + sprintf(state->filename2,F2,getpid()); + + unlink(state->filename1); unlink(state->filename2); + if (mknod(state->filename1, S_IFIFO|0664, 0) || + mknod(state->filename2, S_IFIFO|0664, 0)) { + perror("mknod"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + state->rd = open(state->filename1, O_RDONLY); + state->wr = open(state->filename2, O_WRONLY); + writer(state->wr, state->rd); + return; + + case -1: + perror("fork"); + return; + + default: + state->wr = open(state->filename1, O_WRONLY); + state->rd = open(state->filename2, O_RDONLY); + break; + 
} + + /* + * One time around to make sure both processes are started. + */ + if (write(state->wr, &c, 1) != 1 || read(state->rd, &c, 1) != 1) { + perror("(i) read/write on pipe"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void * cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + unlink(state->filename1); + unlink(state->filename2); + close(state->wr); + close(state->rd); + + if (state->pid > 0) { + kill(state->pid, 15); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char c; + register int w = state->wr; + register int r = state->rd; + register char *cptr = &c; + + while (iterations-- > 0) { + if (write(w, cptr, 1) != 1 || + read(r, cptr, 1) != 1) { + perror("(r) read/write on pipe"); + exit(1); + } + } +} + +void +writer(register int w, register int r) +{ + char c; + register char *cptr = &c; + + for ( ;; ) { + if (read(r, cptr, 1) != 1 || + write(w, cptr, 1) != 1) { + perror("(w) read/write on pipe"); + } + } +} diff --git a/performance/lmbench3/src/lat_fs.c b/performance/lmbench3/src/lat_fs.c new file mode 100644 index 0000000..0dfafb9 --- /dev/null +++ b/performance/lmbench3/src/lat_fs.c @@ -0,0 +1,272 @@ +/* + * Benchmark creates & deletes. 
+ */ + +char *id = "$Id$\n"; + +#include "bench.h" + + +struct _state { + char *tmpdir; + int max; + int n; + char** names; + int ndirs; + char** dirs; + size_t size; +}; +void measure(size_t size, + int parallel, int warmup, int repetitions, void* cookie); +void mkfile(char* s, size_t size); +void setup_names(iter_t iterations, void* cookie); +void cleanup_names(iter_t iterations, void* cookie); +void setup_rm(iter_t iterations, void* cookie); +void cleanup_mk(iter_t iterations, void* cookie); +void benchmark_mk(iter_t iterations, void* cookie); +void benchmark_rm(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int i; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + static int sizes[] = { 0, 1024, 4096, 10*1024 }; + struct _state state; + int c; + char* usage = "[-s <file size>] [-n <max files per dir>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [<dir>]\n"; + + state.size = 0; + state.max = 100; + state.tmpdir = NULL; + + while (( c = getopt(ac, av, "s:n:P:W:N:")) != EOF) { + switch(c) { + case 's': + state.size = bytes(optarg); + break; + case 'n': + state.max = bytes(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac - 1) { + lmbench_usage(ac, av, usage); + } + if (optind == ac - 1) { + state.tmpdir = av[1]; + } + + if (state.size) { + measure(state.size, parallel, warmup, repetitions, &state); + } else { + for (i = 0; i < sizeof(sizes)/sizeof(int); ++i) { + state.size = sizes[i]; + measure(state.size, + parallel, warmup, repetitions, &state); + } + } + return(0); +} + +void +measure(size_t size, int parallel, int warmup, int repetitions, void* cookie) +{ + fprintf(stderr, "%luk", size>>10); + benchmp(setup_names, benchmark_mk, cleanup_mk, 0, parallel, + warmup, repetitions, 
cookie); + if (gettime()) { + fprintf(stderr, "\t%lu\t%.0f", (unsigned long)get_n(), + (double)(1000000. * get_n() / (double)gettime())); + } else { + fprintf(stderr, "\t-1\t-1"); + } + + benchmp(setup_rm, benchmark_rm, cleanup_names, 0, parallel, + warmup, repetitions, cookie); + if (gettime()) { + fprintf(stderr, "\t%.0f", + (double)(1000000. * get_n() / (double)gettime())); + } else { + fprintf(stderr, "\t-1"); + } + fprintf(stderr, "\n"); +} + +void +mkfile(char *name, size_t size) +{ + size_t chunk; + int fd = creat(name, 0666); + char buf[128*1024]; /* XXX - track sizes */ + + while (size > 0) { + chunk = ((size > (128*1024)) ? (128*1024) : size); + write(fd, buf, chunk); + size -= chunk; + } + close(fd); +} + +void +setup_names_recurse(iter_t* foff, iter_t* doff, int depth, struct _state* state) +{ + long i, ndirs, count; + char* basename = state->dirs[*doff]; + char name[L_tmpnam + 8192]; + + if (depth > 0) { + for (count = state->max, i = 1; i < depth; ++i) { + count *= state->max; + } + ndirs = (state->n - *foff) / count + 1; + for (i = 0; i < state->max && i < ndirs && *foff < state->n; ++i) { + sprintf(name, "%s/%ld", basename, i); + state->dirs[++(*doff)] = strdup(name); + mkdir(name, 0777); + setup_names_recurse(foff, doff, depth-1, state); + } + } else { + for (i = 0; i < state->max && *foff < state->n; ++i) { + sprintf(name, "%s/%ld", basename, i); + state->names[(*foff)++] = strdup(name); + } + } +} + +void +setup_names(iter_t iterations, void* cookie) +{ + long i, ndirs, depth; + iter_t foff; + iter_t doff; + char dirname_tmpl[L_tmpnam + 256]; + char* dirname; + struct _state* state = (struct _state*)cookie; + + if (!iterations) return; + + depth = 0; + state->n = iterations; + state->ndirs = iterations / state->max; + if (iterations % state->max) state->ndirs++; + for (ndirs = state->ndirs; ndirs > 1; ) { + ndirs = ndirs / state->max + ((ndirs % state->max) ? 
1 : 0); + state->ndirs += ndirs; + depth++; + } + + state->names = (char**)malloc(iterations * sizeof(char*)); + for (i = 0; i < iterations; ++i) { + state->names[i] = NULL; + } + + state->dirs = (char**)malloc(state->ndirs * sizeof(char*)); + for (i = 0; i < state->ndirs; ++i) { + state->dirs[i] = NULL; + } + + sprintf(dirname_tmpl, "lat_fs_%d_XXXXXX", getpid()); + dirname = tempnam(state->tmpdir, dirname_tmpl); + if (!dirname) { + perror("tempnam failed"); + exit(1); + } + if (mkdir(dirname, S_IRUSR|S_IWUSR|S_IXUSR)) { + perror("mkdir failed"); + exit(1); + } + state->dirs[0] = dirname; + foff = 0; + doff = 0; + setup_names_recurse(&foff, &doff, depth, state); + if (foff != iterations || doff != state->ndirs - 1) { + fprintf(stderr, "setup_names: ERROR: foff=%lu, iterations=%lu, doff=%lu, ndirs=%lu, depth=%d\n", (unsigned long)foff, (unsigned long)iterations, (unsigned long)doff, (unsigned long)state->ndirs, depth); + } +} + +void +cleanup_names(iter_t iterations, void* cookie) +{ + long i; + struct _state* state = (struct _state*)cookie; + + if (!iterations) return; + + for (i = 0; i < state->n; ++i) { + if (state->names[i]) free(state->names[i]); + } + free(state->names); + state->n = 0; + + for (i = state->ndirs - 1; i >= 0; --i) { + if (state->dirs[i]) { + rmdir(state->dirs[i]); + free(state->dirs[i]); + } + } + free(state->dirs); + state->ndirs = 0; +} + +void +setup_rm(iter_t iterations, void* cookie) +{ + if (!iterations) return; + + setup_names(iterations, cookie); + benchmark_mk(iterations, cookie); +} + +void +cleanup_mk(iter_t iterations, void* cookie) +{ + if (!iterations) return; + + benchmark_rm(iterations, cookie); + cleanup_names(iterations, cookie); +} + +void +benchmark_mk(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + if (!state->names[iterations]) { + fprintf(stderr, "benchmark_mk: null filename at %lu of %lu\n", iterations, state->n); + continue; + } + 
mkfile(state->names[iterations], state->size); + } +} + +void +benchmark_rm(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + if (!state->names[iterations]) { + fprintf(stderr, "benchmark_rm: null filename at %lu of %lu\n", iterations, state->n); + continue; + } + unlink(state->names[iterations]); + } +} + diff --git a/performance/lmbench3/src/lat_http.c b/performance/lmbench3/src/lat_http.c new file mode 100644 index 0000000..77e6f38 --- /dev/null +++ b/performance/lmbench3/src/lat_http.c @@ -0,0 +1,128 @@ +/* + * lat_http.c - simple HTTP transaction latency test + * + * usage: lat_http hostname [port] < filelist + * + * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +char *buf; +int debug; +int echo; + +int +http(char *server, char *file, int prog) +{ + int sock; + int n; + int b = 0; + + sock = tcp_connect(server, prog, SOCKOPT_REUSE); + sprintf(buf, "GET /%s HTTP/1.0\r\n\r\n\n", file); + if (debug) { + printf(buf); + } + write(sock, buf, strlen(buf)); + while ((n = read(sock, buf, XFERSIZE)) > 0) { + b += n; + if (echo) { + write(1, buf, n); + } + } + close(sock); + if (debug) { + printf("Got %d\n", b); + } + return (b); +} + +void +killhttp(char *server, int prog) +{ + int sock; + + sock = tcp_connect(server, prog, SOCKOPT_REUSE); + write(sock, "EXIT", 4); + close(sock); +} + +void chop(register char *s) { while (*s && *s != '\n') s++; *s = 0; } + +int +main(int ac, char **av) +{ + char *server; + int i, prog; + int c; + int shutdown = 0; + uint64 total = 0; + uint64 usecs = 0; + double avg; + char *name = av[0]; + char file[1024]; + char *usage = "[-d] [-e] [-S] serverhost [port] < list\n"; + + while (( c = getopt(ac, av, "deS")) != EOF) { + switch(c) { + case 'd': + debug++; + break; + case 'e': + echo++; + break; + case 'S': /* shutdown serverhost */ + shutdown = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind >= ac || optind < ac - 2) { + lmbench_usage(ac, av, usage); + exit(0); + } + server = av[optind++]; + + if (optind < ac && atoi(av[optind]) != 0) { + prog = -atoi(av[optind]); + } else { + prog = -80; + } + + if (shutdown) { + killhttp(server, prog); + exit(0); + } + + i = 0; + buf = valloc(XFERSIZE); + bzero(buf, XFERSIZE); + while (fgets(file, sizeof(file), stdin)) { + chop(file); + start(0); + total += http(server, file, prog); + usecs += stop(0,0); + i++; + } + avg = total; + avg /= (i - 1); + if (avg > 1000) { + avg /= 1000; + fprintf(stderr, "Avg xfer: %.1fKB, ", avg); + } else { + fprintf(stderr, "Avg xfer %d, ", (int)avg); + } + settime(usecs); + latency((uint64)1, total); + exit(0); +} + diff --git 
a/performance/lmbench3/src/lat_mem_rd.c b/performance/lmbench3/src/lat_mem_rd.c new file mode 100644 index 0000000..e56e458 --- /dev/null +++ b/performance/lmbench3/src/lat_mem_rd.c @@ -0,0 +1,169 @@ +/* + * lat_mem_rd.c - measure memory load latency + * + * usage: lat_mem_rd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] size-in-MB [stride ...] + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2003, 2004 Carl Staelin. + * + * Distributed under the FSF GPL with additional restriction that results + * may published only if: + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id: s.lat_mem_rd.c 1.13 98/06/30 16:13:49-07:00 lm@xxxxxxxxxxxxxxx $\n"; + +#include "bench.h" +#define STRIDE (512/sizeof(char *)) +#define LOWER 512 +void loads(size_t len, size_t range, size_t stride, + int parallel, int warmup, int repetitions); +size_t step(size_t k); +void initialize(iter_t iterations, void* cookie); + +benchmp_f fpInit = stride_initialize; + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + size_t len; + size_t range; + size_t stride; + char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] len [stride...]\n"; + + while (( c = getopt(ac, av, "tP:W:N:")) != EOF) { + switch(c) { + case 't': + fpInit = thrash_initialize; + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind == ac) { + lmbench_usage(ac, av, usage); + } + + len = atoi(av[optind]); + len *= 1024 * 1024; + + if (optind == ac - 1) { + fprintf(stderr, "\"stride=%d\n", STRIDE); + for (range = LOWER; range <= len; range = step(range)) { + 
loads(len, range, STRIDE, parallel, + warmup, repetitions); + } + } else { + for (i = optind + 1; i < ac; ++i) { + stride = bytes(av[i]); + fprintf(stderr, "\"stride=%d\n", stride); + for (range = LOWER; range <= len; range = step(range)) { + loads(len, range, stride, parallel, + warmup, repetitions); + } + fprintf(stderr, "\n"); + } + } + return(0); +} + +#define ONE p = (char **)*p; +#define FIVE ONE ONE ONE ONE ONE +#define TEN FIVE FIVE +#define FIFTY TEN TEN TEN TEN TEN +#define HUNDRED FIFTY FIFTY + + +void +benchmark_loads(iter_t iterations, void *cookie) +{ + struct mem_state* state = (struct mem_state*)cookie; + register char **p = (char**)state->p[0]; + register size_t i; + register size_t count = state->len / (state->line * 100) + 1; + + while (iterations-- > 0) { + for (i = 0; i < count; ++i) { + HUNDRED; + } + } + + use_pointer((void *)p); + state->p[0] = (char*)p; +} + + +void +loads(size_t len, size_t range, size_t stride, + int parallel, int warmup, int repetitions) +{ + double result; + size_t count; + struct mem_state state; + + if (range < stride) return; + + state.width = 1; + state.len = range; + state.maxlen = len; + state.line = stride; + state.pagesize = getpagesize(); + count = 100 * (state.len / (state.line * 100) + 1); + +#if 0 + (*fpInit)(0, &state); + fprintf(stderr, "loads: after init\n"); + (*benchmark_loads)(2, &state); + fprintf(stderr, "loads: after benchmark\n"); + mem_cleanup(0, &state); + fprintf(stderr, "loads: after cleanup\n"); + settime(1); + save_n(1); +#else + /* + * Now walk them and time it. + */ + benchmp(fpInit, benchmark_loads, mem_cleanup, + 100000, parallel, warmup, repetitions, &state); +#endif + + /* We want to get to nanoseconds / load. */ + save_minimum(); + result = (1000. * (double)gettime()) / (double)(count * get_n()); + fprintf(stderr, "%.5f %.3f\n", range / (1024. 
* 1024.), result); + +} + +size_t +step(size_t k) +{ + if (k < 1024) { + k = k * 2; + } else if (k < 4*1024) { + k += 1024; + } else { + size_t s; + + for (s = 32 * 1024; s <= k; s *= 2) + ; + k += s / 16; + } + return (k); +} diff --git a/performance/lmbench3/src/lat_mmap.c b/performance/lmbench3/src/lat_mmap.c new file mode 100644 index 0000000..1a6445b --- /dev/null +++ b/performance/lmbench3/src/lat_mmap.c @@ -0,0 +1,175 @@ +/* + * lat_mmap.c - time how fast a mapping can be made and broken down + * + * Usage: mmap [-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file + * + * XXX - If an implementation did lazy address space mapping, this test + * will make that system look very good. I haven't heard of such a system. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define PSIZE (16<<10) +#define N 10 +#define STRIDE (10*PSIZE) +#define MINSIZE (STRIDE*2) + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + + +typedef struct _state { + size_t size; + int fd; + int random; + int clone; + char *name; +} state_t; + +void init(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void domapping(iter_t iterations, void * cookie); + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char buf[256]; + int c; + char *usage = "[-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file\n"; + + + state.random = 0; + state.clone = 0; + while (( c = getopt(ac, av, "rP:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'r': + state.random = 1; + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 2 != ac) { + lmbench_usage(ac, av, usage); + } + + state.size = bytes(av[optind]); + if (state.size < MINSIZE) { + return (1); + } + state.name = av[optind+1]; + + benchmp(init, domapping, cleanup, 0, parallel, + warmup, repetitions, &state); + + if (gettime() > 0) { + micromb(state.size, get_n()); + } + return (0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->name) + strlen(buf) + 1); + sprintf(s, "%s%d", state->name, (int)getpid()); + if (cp(state->name, s, S_IREAD|S_IWRITE) < 0) { + perror("Could not copy file"); + unlink(s); + exit(1); + } + state->name = s; + } + 
CHK(state->fd = open(state->name, O_RDWR)); + if (state->clone) unlink(state->name); + if (lseek(state->fd, 0, SEEK_END) < state->size) { + fprintf(stderr, "Input file too small\n"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->fd); +} + +/* + * This alg due to Linus. The goal is to have both sparse and full + * mappings reported. + */ +void +domapping(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + register int fd = state->fd; + register size_t size = state->size; + register int random = state->random; + register char *p, *where, *end; + register char c = size & 0xff; + + while (iterations-- > 0) { + +#ifdef MAP_FILE + where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_SHARED, fd, 0); +#else + where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +#endif + if ((long)where == -1) { + perror("mmap"); + exit(1); + } + if (random) { + end = where + size; + for (p = where; p < end; p += STRIDE) { + *p = c; + } + } else { + end = where + (size / N); + for (p = where; p < end; p += PSIZE) { + *p = c; + } + } + munmap(where, size); + } +} diff --git a/performance/lmbench3/src/lat_ops.c b/performance/lmbench3/src/lat_ops.c new file mode 100755 index 0000000..a86b449 --- /dev/null +++ b/performance/lmbench3/src/lat_ops.c @@ -0,0 +1,485 @@ +/* + * lat_ops.c - benchmark of simple operations + * + * Copyright (c) 1996-2004 Carl Staelin and Larry McVoy. + * + * This benchmark is meant to benchmark raw arithmetic operation + * latency for various operations on various datatypes. Obviously, + * not all operations make sense for all datatypes (e.g., modulus + * on float). The benchmarks are configured to use interlocking + * operations, so we measure the time of an individual operation. 
+ * + * The exception to the interlocking operation guidelines are the + * vector operations, muladd and bogomflops, for both float and + * double data types. In this case we are trying to determine + * how well the CPU can schedule the various arithmetic units + * and overlap adjacent operations to get the maximal throughput + * from the system. In addition, we are using relatively short + * vectors so these operations should be going to/from L1 (or + * possibly L2) cache, rather than main memory, which should + * reduce or eliminate the memory overheads. + * + * The vector operations use a slightly unrolled loop because + * this is common in scientific codes that do these sorts of + * operations. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + int N; + int M; + int K; + double* data; +}; + +#define FIVE(a) a a a a a +#define TEN(a) a a a a a a a a a a +#define HUNDRED(a) TEN(TEN(a)) + +void +float_initialize(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register float* x; + + if (iterations) return; + + x = (float*)malloc(pState->M * sizeof(float)); + pState->data = (double*)x; + for (i = 0; i < pState->M; ++i) { + x[i] = 3.14159265; + } +} + +void +double_initialize(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + + if (iterations) return; + + pState->data = (double*)malloc(pState->M * sizeof(double)); + for (i = 0; i < pState->M; ++i) { + pState->data[i] = 3.14159265; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + + if (iterations) return; + + if (pState->data) + free(pState->data); +} + +void +do_integer_bitwise(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N; + register int s = (int)iterations; + + while (iterations-- > 0) { + HUNDRED(r ^= iterations; s ^= r; r |= s;) + } + use_int(r); +} + 
+void +do_integer_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int a = pState->N + 57; + register int b = pState->N + 31; + + while (iterations-- > 0) { + HUNDRED(a += b; b -= a;) + } + use_int(a+b); +} + +void +do_integer_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + 37431; + register int s = pState->N + 4; + register int t = r * s * s * s * s * s * s * s * s * s * s - r; + + while (iterations-- > 0) { + TEN(r *= s;); r -= t; + TEN(r *= s;); r -= t; + } + use_int(r); +} + +void +do_integer_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + 36; + register int s = (r + 1) << 20; + + while (iterations-- > 0) { + HUNDRED(r = s / r;) + } + use_int(r); +} + +void +do_integer_mod(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int r = pState->N + iterations; + register int s = pState->N + 62; + + while (iterations-- > 0) { + HUNDRED(r %= s; r |= s;) + } + use_int(r); +} + +void +do_int64_bitwise(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = (int64)pState->N | (int64)pState->N<<32; + register int64 s = (int64)iterations | (int64)iterations<<32; + register int64 i = (int64)iterations<<34 - 1; + + while (iterations-- > 0) { + HUNDRED(r ^= i; s ^= r; r |= s;) + i--; + } + use_int((int)r); +} + +void +do_int64_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 a = (int64)pState->N + 37420; + register int64 b = (int64)pState->N + 21698324; + + a += (int64)(0xFE + pState->N)<<30; + b += (int64)(0xFFFE + pState->N)<<29; + + while (iterations-- > 0) { + HUNDRED(a += b; b -= a;) + } + use_int((int)a+(int)b); +} + +void +do_int64_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + 
register int64 r = (int64)pState->N + 37420; + register int64 s = (int64)pState->N + 4; + register int64 t; + + r += (int64)(pState->N + 6)<<32; + t = r * s * s * s * s * s * s * s * s * s * s - r; + + while (iterations-- > 0) { + TEN(r *= s;); r -= t; + TEN(r *= s;); r -= t; + } + use_int((int)r); +} + +void +do_int64_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = (int64)pState->N + 36; + register int64 s; + + r += r << 33; + s = (r + 17) << 13; + + while (iterations-- > 0) { + HUNDRED(r = s / r;) + } + use_int((int)r); +} + +void +do_int64_mod(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int64 r = iterations + (int64)iterations<<32; + register int64 s = (int64)pState->N + (int64)pState->N<<56; + + while (iterations-- > 0) { + HUNDRED(r %= s; r |= s;); + } + use_int((int)r); +} + +void +do_float_add(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = (float)pState->N; + register float g = (float)pState->K; + + while (iterations-- > 0) { + TEN(f += (float)f;) f += (float)g; + TEN(f += (float)f;) f += (float)g; + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = 8.0f * (float)pState->N; + register float g = 0.125f * (float)pState->M / 1000.0; + + while (iterations-- > 0) { + TEN(f *= f; f *= g;); + TEN(f *= f; f *= g;); + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register float f = 1.41421356f * (float)pState->N; + register float g = 3.14159265f * (float)pState->M / 1000.0; + + while (iterations-- > 0) { + FIVE(TEN(f = g / f;) TEN(g = f / g;)) + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_add(iter_t iterations, void* cookie) +{ + struct _state *pState 
= (struct _state*)cookie; + register double f = (double)pState->N; + register double g = (double)pState->K; + + while (iterations-- > 0) { + TEN(f += (double)f;) f += (double)g; + TEN(f += (double)f;) f += (double)g; + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_mul(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register double f = 8.0 * (double)pState->N; + register double g = 0.125 * (double)pState->M / 1000.0; + + while (iterations-- > 0) { + TEN(f *= f; f *= g;) + TEN(f *= f; f *= g;) + } + use_int((int)f); + use_int((int)g); +} + +void +do_double_div(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register double f = 1.41421356 * (double)pState->N; + register double g = 3.14159265 * (double)pState->M / 1000.0; + + while (iterations-- > 0) { + FIVE(TEN(f = g / f;) TEN(g = f / g;)) + } + use_int((int)f); + use_int((int)g); +} + +void +do_float_bogomflops(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register int M = pState->M / 10; + + while (iterations-- > 0) { + register float *x = (float*)pState->data; + for (i = 0; i < M; ++i) { + x[0] = (1.0f + x[0]) * (1.5f - x[0]) / x[0]; + x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; + x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; + x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; + x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; + x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; + x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; + x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; + x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; + x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; + x += 10; + } + } +} + +void +do_double_bogomflops(iter_t iterations, void* cookie) +{ + struct _state *pState = (struct _state*)cookie; + register int i; + register int M = pState->M / 10; + + while (iterations-- > 0) { + register double *x = (double*)pState->data; + for (i = 0; i < M; ++i) { + x[0] = (1.0f + x[0]) 
* (1.5f - x[0]) / x[0]; + x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; + x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; + x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; + x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; + x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; + x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; + x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; + x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; + x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; + x += 10; + } + } +} + +int +main(int ac, char **av) +{ + int __n = 1; + int c, i, j; + int warmup = 0; + int parallel = 1; + int repetitions = TRIES; + uint64 iop_time; + uint64 iop_N; + struct _state state; + char *usage = "[-W <warmup>] [-N <repetitions>] [-P <parallel>] \n"; + + state.N = 1; + state.M = 1000; + state.K = -1023; + state.data = NULL; + + while (( c = getopt(ac, av, "W:N:P:")) != EOF) { + switch(c) { + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + benchmp(NULL, do_integer_bitwise, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer bit", get_n() * 100 * 3); + + benchmp(NULL, do_integer_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer add", get_n() * 100 * 2); + iop_time = gettime(); + iop_N = get_n() * 100 * 2; + + benchmp(NULL, do_integer_mul, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 2 * iop_time) / iop_N); + nano("integer mul", get_n() * 10 * 2); + + benchmp(NULL, do_integer_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("integer div", get_n() * 100); + + benchmp(NULL, do_integer_mod, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 100 * iop_time) / iop_N); + nano("integer mod", get_n() * 100); + + benchmp(NULL, do_int64_bitwise, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 bit", get_n() * 
100 * 3); + iop_time = gettime(); + iop_N = get_n() * 100 * 3; + + benchmp(NULL, do_int64_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 add", get_n() * 100 * 2); + + benchmp(NULL, do_int64_mul, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 2 * iop_time) / iop_N); + nano("int64 mul", get_n() * 10 * 2); + + benchmp(NULL, do_int64_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("int64 div", get_n() * 100); + + benchmp(NULL, do_int64_mod, NULL, + 0, 1, warmup, repetitions, &state); + settime(gettime() - (get_n() * 100 * iop_time) / iop_N); + nano("int64 mod", get_n() * 100); + + benchmp(NULL, do_float_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("float add", get_n() * (10 + 1) * 2); + + benchmp(NULL, do_float_mul, NULL, + 0, 1, warmup, repetitions, &state); + nano("float mul", get_n() * 10 * 2 * 2); + + benchmp(NULL, do_float_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("float div", get_n() * 100); + + benchmp(NULL, do_double_add, NULL, + 0, 1, warmup, repetitions, &state); + nano("double add", get_n() * (10 + 1) * 2); + + benchmp(NULL, do_double_mul, NULL, + 0, 1, warmup, repetitions, &state); + nano("double mul", get_n() * 10 * 2 * 2); + + benchmp(NULL, do_double_div, NULL, + 0, 1, warmup, repetitions, &state); + nano("double div", get_n() * 100); + + benchmp(float_initialize, do_float_bogomflops, cleanup, + 0, parallel, warmup, repetitions, &state); + nano("float bogomflops", get_n() * state.M); + fflush(stdout); fflush(stderr); + + benchmp(double_initialize, do_double_bogomflops, cleanup, + 0, parallel, warmup, repetitions, &state); + nano("double bogomflops", get_n() * state.M); + fflush(stdout); fflush(stderr); + + return(0); +} + diff --git a/performance/lmbench3/src/lat_pagefault.c b/performance/lmbench3/src/lat_pagefault.c new file mode 100644 index 0000000..02af9f4 --- /dev/null +++ b/performance/lmbench3/src/lat_pagefault.c @@ -0,0 +1,202 @@ +/* + * lat_pagefault.c - time a page 
fault in + * + * Usage: lat_pagefault [-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + +typedef struct _state { + int fd; + int size; + int npages; + int clone; + char* file; + char* where; + size_t* pages; +} state_t; + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void benchmark(iter_t iterations, void * cookie); +void benchmark_mmap(iter_t iterations, void * cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + double t_mmap; + double t_combined; + struct stat st; + struct _state state; + char buf[2048]; + char* usage = "[-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file\n"; + + state.clone = 0; + + while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'C': + state.clone = 1; + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 ) { + lmbench_usage(ac, av, usage); + } + + state.file = av[optind]; + CHK(stat(state.file, &st)); + state.npages = st.st_size / (size_t)getpagesize(); + +#ifdef MS_INVALIDATE + benchmp(initialize, benchmark_mmap, cleanup, 0, parallel, + warmup, repetitions, &state); + t_mmap = gettime() / (double)get_n(); + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, 
&state); + t_combined = gettime() / (double)get_n(); + settime(get_n() * (t_combined - t_mmap)); + + sprintf(buf, "Pagefaults on %s", state.file); + micro(buf, state.npages * get_n()); +#endif + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + int i, npages, pagesize; + int *p; + unsigned int r; + struct stat sbuf; + state_t *state = (state_t *) cookie; + + if (iterations) return; + + if (state->clone) { + char buf[128]; + char* s; + + /* copy original file into a process-specific one */ + sprintf(buf, "%d", (int)getpid()); + s = (char*)malloc(strlen(state->file) + strlen(buf) + 1); + sprintf(s, "%s%d", state->file, (int)getpid()); + if (cp(state->file, s, S_IREAD|S_IWRITE) < 0) { + perror("Could not copy file"); + unlink(s); + exit(1); + } + state->file = s; + } + CHK(state->fd = open(state->file, 0)); + if (state->clone) unlink(state->file); + CHK(fstat(state->fd, &sbuf)); + + srand(getpid()); + pagesize = getpagesize(); + state->size = sbuf.st_size; + state->size -= state->size % pagesize; + state->npages = state->size / pagesize; + state->pages = permutation(state->npages, pagesize); + + if (state->size < 1024*1024) { + fprintf(stderr, "lat_pagefault: %s too small\n", state->file); + exit(1); + } + state->where = mmap(0, state->size, + PROT_READ, MAP_SHARED, state->fd, 0); + +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + munmap(state->where, state->size); + if (state->fd >= 0) close(state->fd); + free(state->pages); +} + +void +benchmark(iter_t iterations, void* cookie) +{ + int i; + int sum = 0; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + for (i = 0; i < state->npages; ++i) { + sum += *(state->where + state->pages[i]); + } + munmap(state->where, state->size); + state->where = mmap(0, state->size, + 
PROT_READ, MAP_SHARED, state->fd, 0); +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif + } + use_int(sum); +} + +void +benchmark_mmap(iter_t iterations, void* cookie) +{ + int i; + int sum = 0; + state_t *state = (state_t *) cookie; + + while (iterations-- > 0) { + munmap(state->where, state->size); + state->where = mmap(0, state->size, + PROT_READ, MAP_SHARED, state->fd, 0); +#ifdef MS_INVALIDATE + if (msync(state->where, state->size, MS_INVALIDATE) != 0) { + perror("msync"); + exit(1); + } +#endif + } + use_int(sum); +} + diff --git a/performance/lmbench3/src/lat_pipe.c b/performance/lmbench3/src/lat_pipe.c new file mode 100644 index 0000000..bdf2a79 --- /dev/null +++ b/performance/lmbench3/src/lat_pipe.c @@ -0,0 +1,155 @@ +/* + * lat_pipe.c - pipe transaction test + * + * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int w, int r); + +typedef struct _state { + int pid; + int p1[2]; + int p2[2]; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Pipe latency", get_n()); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (pipe(state->p1) == -1) { + perror("pipe"); + exit(1); + } + if (pipe(state->p2) == -1) { + perror("pipe"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + handle_scheduler(benchmp_childid(), 1, 1); + signal(SIGTERM, exit); + close(state->p1[1]); + close(state->p2[0]); + writer(state->p2[1], state->p1[0]); + return; + + case -1: + perror("fork"); + return; + + default: + close(state->p1[0]); + close(state->p2[1]); + break; + } + + /* + * One time around to make sure both processes are started. 
+ */ + if (write(state->p1[1], &c, 1) != 1 || read(state->p2[0], &c, 1) != 1){ + perror("(i) read/write on pipe"); + exit(1); + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + char c; + register int w = state->p1[1]; + register int r = state->p2[0]; + register char *cptr = &c; + + while (iterations-- > 0) { + if (write(w, cptr, 1) != 1 || + read(r, cptr, 1) != 1) { + perror("(r) read/write on pipe"); + exit(1); + } + } +} + +void +writer(register int w, register int r) +{ + char c; + register char *cptr = &c; + + for ( ;; ) { + if (read(r, cptr, 1) != 1 || + write(w, cptr, 1) != 1) { + perror("(w) read/write on pipe"); + } + } +} diff --git a/performance/lmbench3/src/lat_pmake.c b/performance/lmbench3/src/lat_pmake.c new file mode 100644 index 0000000..8d898eb --- /dev/null +++ b/performance/lmbench3/src/lat_pmake.c @@ -0,0 +1,158 @@ +/* + * lat_pmake.c - time to complete N jobs which each do usecs worth of work + * + * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] jobs usecs + * + * Copyright (c) 1994 Larry McVoy. + * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void setup(iter_t iterations, void* cookie); +void bench(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void work(iter_t iterations, void *cookie); + +typedef struct _state { + int jobs; /* number of jobs to create */ + iter_t iterations; /* how long each job should work */ + long* x; /* used by work() */ + long** p; + pid_t* pids; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + double time; + uint64 usecs; + char buf[1024]; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] Njobs usecs...\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (ac < optind + 2) { + lmbench_usage(ac, av, usage); + } + state.jobs = atoi(av[optind]); + state.pids = NULL; + fprintf(stderr, "\"pmake jobs=%d\n", state.jobs); + while (++optind < ac) { + usecs = bytes(av[optind]); + benchmp(setup, work, NULL, 0, 1, 0, TRIES, &state); + if (gettime() == 0) exit(1); + state.iterations = (iter_t)((usecs * get_n()) / gettime()); + + benchmp(setup, bench, NULL, 0, parallel, + warmup, repetitions, &state); + time = gettime(); + time /= get_n(); + if (time > 0.0) + fprintf(stderr, "%llu %.2f\n", usecs, time); + } + return (0); +} + +void +setup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->x = (long*)malloc(sizeof(long*)); + *(long**)state->x = state->x; + state->p = (long**)state->x; + + handle_scheduler(benchmp_childid(), 0, state->jobs); +} + +void +bench(register iter_t iterations, void *cookie) +{ + int i; + int status; + state_t *state = (state_t *) cookie; 
+ + state->pids = (pid_t*)malloc(state->jobs * sizeof(pid_t)); + + /* + * This design has one buglet --- we cannot detect if the + * worker process died prematurely. I.e., we don't have + * a handshake step to collect "I finished correctly" + * messages. + */ + while (iterations-- > 0) { + for (i = 0; i < state->jobs; ++i) { + if ((state->pids[i] = fork()) == 0) { + handle_scheduler(benchmp_childid(), i+1, state->jobs); + work(state->iterations, state); + exit(0); + } + } + for (i = 0; i < state->jobs; ++i) { + waitpid(state->pids[i], &status, 0); + state->pids[i] = -1; + + /* child died badly */ + if (!WIFEXITED(status)) { + cleanup(0, cookie); + exit(1); + } + } + } +} + +void +cleanup(register iter_t iterations, void *cookie) +{ + int i; + state_t *state = (state_t *) cookie; + + for (i = 0; i < state->jobs; ++i) { + if (state->pids[i] > 0) { + kill(state->pids[i], SIGKILL); + waitpid(state->pids[i], NULL, 0); + state->pids[i] = -1; + } + } +} + +void +work(register iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + register long** p = state->p; + +#define WORK_TEN(one) one one one one one one one one one one + while (iterations-- > 0) { + WORK_TEN(p = (long**) *p;); + } + state->p = p; +} diff --git a/performance/lmbench3/src/lat_proc.c b/performance/lmbench3/src/lat_proc.c new file mode 100644 index 0000000..e36e19d --- /dev/null +++ b/performance/lmbench3/src/lat_proc.c @@ -0,0 +1,182 @@ +/* + * lat_proc.c - process creation tests + * + * Usage: lat_proc [-P <parallelism] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell + * + * TODO - linux clone, plan9 rfork, IRIX sproc(). + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + + +#ifdef STATIC +#define PROG "/tmp/hello-s" +#define STATIC_PREFIX "Static " +#else +#define PROG "/tmp/hello" +#define STATIC_PREFIX "" +#endif + +void do_shell(iter_t iterations, void* cookie); +void do_forkexec(iter_t iterations,void* cookie); +void do_fork(iter_t iterations, void* cookie); +void do_procedure(iter_t iterations, void* cookie); + +pid_t child_pid; + + +void +cleanup(iter_t iterations, void* cookie) +{ + if (iterations) return; + + if (child_pid) { + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + child_pid = 0; + } +} + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { /* should have one argument left */ + lmbench_usage(ac, av, usage); + } + + if (!strcmp("procedure", av[optind])) { + benchmp(NULL, do_procedure, cleanup, 0, parallel, + warmup, repetitions, &ac); + micro("Procedure call", get_n()); + } else if (!strcmp("fork", av[optind])) { + benchmp(NULL, do_fork, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+exit", get_n()); + } else if (!strcmp("exec", av[optind])) { + benchmp(NULL, do_forkexec, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+execve", get_n()); + } else if (!strcmp("shell", av[optind])) { + benchmp(NULL, do_shell, cleanup, 0, parallel, + warmup, repetitions, NULL); + micro(STATIC_PREFIX "Process fork+/bin/sh -c", get_n()); + } else { + lmbench_usage(ac, av, usage); + } + return(0); +} + 
+void +do_shell(iter_t iterations, void* cookie) +{ + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + close(1); + execlp("/bin/sh", "sh", "-c", PROG, 0); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_forkexec(iter_t iterations, void* cookie) +{ + char *nav[2]; + + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + nav[0] = PROG; + nav[1] = 0; + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + close(1); + execve(PROG, nav, 0); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_fork(iter_t iterations, void* cookie) +{ + signal(SIGCHLD, SIG_DFL); + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + switch (child_pid = fork()) { + case -1: + perror("fork"); + exit(1); + + case 0: /* child */ + handle_scheduler(benchmp_childid(), 1, 1); + exit(1); + + default: + waitpid(child_pid, NULL,0); + } + child_pid = 0; + } +} + +void +do_procedure(iter_t iterations, void* cookie) +{ + int r = *(int *) cookie; + handle_scheduler(benchmp_childid(), 0, 1); + while (iterations-- > 0) { + use_int(r); + } +} diff --git a/performance/lmbench3/src/lat_rand.c b/performance/lmbench3/src/lat_rand.c new file mode 100644 index 0000000..42b3aaf --- /dev/null +++ b/performance/lmbench3/src/lat_rand.c @@ -0,0 +1,120 @@ +/* + * lat_rand.c - random number generation + * + * usage: lat_rand [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2002 Carl Staelin. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Hewlett-Packard is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +#ifdef HAVE_DRAND48 +void bench_drand48(iter_t iterations, void *cookie); +void bench_lrand48(iter_t iterations, void *cookie); +#endif +#ifdef HAVE_RAND +void bench_rand(iter_t iterations, void *cookie); +#endif +#ifdef HAVE_RANDOM +void bench_random(iter_t iterations, void *cookie); +#endif +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + +#ifdef HAVE_DRAND48 + benchmp(NULL, bench_drand48, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("drand48 latency", get_n()); + + benchmp(NULL, bench_lrand48, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("lrand48 latency", get_n()); +#endif +#ifdef HAVE_RAND + benchmp(NULL, bench_rand, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("rand latency", get_n()); +#endif +#ifdef HAVE_RANDOM + benchmp(NULL, bench_random, NULL, + 0, parallel, warmup, repetitions, NULL); + nano("random latency", get_n()); +#endif + return (0); +} + +#ifdef HAVE_DRAND48 +void +bench_drand48(register iter_t iterations, void *cookie) +{ + register double v = 0.0; + while (iterations-- > 0) { + v += drand48(); + } + use_int((int)v); +} + +void +bench_lrand48(register iter_t iterations, void 
*cookie) +{ + register long v = 0.0; + while (iterations-- > 0) { + v += lrand48(); + } + use_int((int)v); +} +#endif /* HAVE_DRAND48 */ +#ifdef HAVE_RAND +void +bench_rand(register iter_t iterations, void *cookie) +{ + register int v = 0.0; + while (iterations-- > 0) { + v += rand(); + } + use_int((int)v); +} +#endif /* HAVE_RAND */ +#ifdef HAVE_RANDOM +void +bench_random(register iter_t iterations, void *cookie) +{ + register int v = 0.0; + while (iterations-- > 0) { + v += random(); + } + use_int((int)v); +} +#endif /* HAVE_RANDOM */ diff --git a/performance/lmbench3/src/lat_rpc.c b/performance/lmbench3/src/lat_rpc.c new file mode 100644 index 0000000..3ebfb16 --- /dev/null +++ b/performance/lmbench3/src/lat_rpc.c @@ -0,0 +1,285 @@ +/* + * lat_rpc.c - simple RPC transaction latency test + * + * Four programs in one - + * server usage: lat_rpc -s + * client usage: lat_rpc hostname + * client usage: lat_rpc -p tcp hostname + * client usage: lat_rpc -p udp hostname + * shutdown: lat_rpc -S hostname + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +void client_main(int ac, char **av); +void server_main(); +void benchmark(iter_t iterations, void* _state); +char *client_rpc_xact_1(char *argp, CLIENT *clnt); + +void +doit(CLIENT *cl, char *server) +{ + char c = 1; + char *resp; + + resp = client_rpc_xact_1(&c, cl); + if (!resp) { + clnt_perror(cl, server); + exit(1); + } + if (*resp != 123) { + fprintf(stderr, "lat_rpc: got bad data\n"); + exit(1); + } +} + + +/* Default timeout can be changed using clnt_control() */ +static struct timeval TIMEOUT = { 0, 25000 }; + +char *proto[] = { "tcp", "udp", 0 }; + +typedef struct state_ { + int msize; + char *server; + char *protocol; + CLIENT *cl; +} state_t; + +void +initialize(iter_t iterations, void* cookie) +{ + struct timeval tv; + state_t *state = (state_t*)cookie; + + if (iterations) return; + + state->cl = clnt_create(state->server, XACT_PROG, XACT_VERS, + state->protocol); + if (!state->cl) { + clnt_pcreateerror(state->server); + exit(1); + } + if (strcasecmp(state->protocol, proto[1]) == 0) { + tv.tv_sec = 0; + tv.tv_usec = 2500; + if (!clnt_control(state->cl, CLSET_RETRY_TIMEOUT, (char *)&tv)) { + clnt_perror(state->cl, "setting timeout"); + exit(1); + } + } +} + +void +benchmark(iter_t iterations, void* _state) +{ + state_t* state = (state_t*)_state; + char buf[256]; + + while (iterations-- > 0) { + doit(state->cl, state->server); + } +} + +int +main(int ac, char **av) +{ + int i; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + int shutdown = 0; + state_t state; + CLIENT *cl; + char buf[1024]; + char *protocol = NULL; + char *usage = "-s\n OR [-p <tcp|udp>] [-P parallel] [-W <warmup>] [-N <repetitions>] serverhost\n OR -S serverhost\n"; + + state.msize = 1; + + while (( c = getopt(ac, av, "sS:m:p:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + cl = 
clnt_create(optarg, XACT_PROG, XACT_VERS, "udp"); + if (!cl) { + clnt_pcreateerror(state.server); + exit(1); + } + clnt_call(cl, RPC_EXIT, (xdrproc_t)xdr_void, 0, + (xdrproc_t)xdr_void, 0, TIMEOUT); + exit(0); + } + case 'm': + state.msize = atoi(optarg); + break; + case 'p': + protocol = optarg; + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind++]; + + if (protocol == NULL || !strcasecmp(protocol, proto[0])) { + state.protocol = proto[0]; + benchmp(initialize, benchmark, NULL, MEDIUM, parallel, + warmup, repetitions, &state); + sprintf(buf, "RPC/%s latency using %s", proto[0], state.server); + micro(buf, get_n()); + } + + if (protocol == NULL || !strcasecmp(protocol, proto[1])) { + state.protocol = proto[1]; + benchmp(initialize, benchmark, NULL, MEDIUM, parallel, + warmup, repetitions, &state); + sprintf(buf, "RPC/%s latency using %s", proto[1], state.server); + micro(buf, get_n()); + } + + exit(0); +} + +char * +client_rpc_xact_1(char *argp, CLIENT *clnt) +{ + static char res; + + bzero((void*)&res, sizeof(res)); + if (clnt_call(clnt, RPC_XACT, (xdrproc_t)xdr_char, + argp, (xdrproc_t)xdr_char, &res, TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (&res); +} + +/* + * The remote procedure[s] that will be called + */ +/* ARGSUSED */ +char * +rpc_xact_1(msg, transp) + char *msg; + register SVCXPRT *transp; +{ + static char r = 123; + + return &r; +} + +static void xact_prog_1(); + +void +server_main() +{ + register SVCXPRT *transp; + + GO_AWAY; + + (void) pmap_unset(XACT_PROG, XACT_VERS); + + transp = svcudp_create(RPC_ANYSOCK); + if (transp == NULL) { + fprintf(stderr, "cannot create udp service.\n"); + exit(1); + } + if (!svc_register(transp, 
XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_UDP)) { + fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, udp).\n"); + exit(1); + } + + transp = svctcp_create(RPC_ANYSOCK, 0, 0); + if (transp == NULL) { + fprintf(stderr, "cannot create tcp service.\n"); + exit(1); + } + if (!svc_register(transp, XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_TCP)) { + fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, tcp).\n"); + exit(1); + } + + svc_run(); + fprintf(stderr, "svc_run returned\n"); + exit(1); + /* NOTREACHED */ +} + +static void +xact_prog_1(rqstp, transp) + struct svc_req *rqstp; + register SVCXPRT *transp; +{ + union { + char rpc_xact_1_arg; + } argument; + char *result; + bool_t (*xdr_argument)(), (*xdr_result)(); + char *(*local)(); + + switch (rqstp->rq_proc) { + case NULLPROC: + (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); + return; + + case RPC_XACT: + xdr_argument = xdr_char; + xdr_result = xdr_char; + local = (char *(*)()) rpc_xact_1; + break; + + case RPC_EXIT: + (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); + (void) pmap_unset(XACT_PROG, XACT_VERS); + exit(0); + + default: + svcerr_noproc(transp); + return; + } + bzero((char *)&argument, sizeof(argument)); + if (!svc_getargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { + svcerr_decode(transp); + return; + } + result = (*local)(&argument, rqstp); + if (result != NULL && !svc_sendreply(transp, (xdrproc_t)xdr_result, result)) { + svcerr_systemerr(transp); + } + if (!svc_freeargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { + fprintf(stderr, "unable to free arguments\n"); + exit(1); + } + return; +} diff --git a/performance/lmbench3/src/lat_select.c b/performance/lmbench3/src/lat_select.c new file mode 100644 index 0000000..583b505 --- /dev/null +++ b/performance/lmbench3/src/lat_select.c @@ -0,0 +1,223 @@ +/* + * lat_select.c - time select system call + * + * usage: lat_select [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [n] + * + * 
Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int w, int r); +void server(void* cookie); + +typedef int (*open_f)(void* cookie); +int open_file(void* cookie); +int open_socket(void* cookie); + +typedef struct _state { + char fname[L_tmpnam]; + open_f fid_f; + pid_t pid; + int sock; + int fid; + int num; + int max; + fd_set set; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-n <#descriptors>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] file|tcp\n"; + char buf[256]; + + morefds(); /* bump fd_cur to fd_max */ + state.num = 200; + while (( c = getopt(ac, av, "P:W:N:n:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + case 'n': + state.num = bytes(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + if (streq("tcp", av[optind])) { + state.fid_f = open_socket; + server(&state); + benchmp(initialize, doit, cleanup, 0, parallel, + warmup, repetitions, &state); + sprintf(buf, "Select on %d tcp fd's", state.num); + kill(state.pid, SIGKILL); + waitpid(state.pid, NULL, 0); + micro(buf, get_n()); + } else if (streq("file", av[optind])) { + state.fid_f = open_file; + server(&state); + benchmp(initialize, doit, cleanup, 0, parallel, + warmup, repetitions, &state); + unlink(state.fname); + sprintf(buf, "Select 
on %d fd's", state.num); + micro(buf, get_n()); + } else { + lmbench_usage(ac, av, usage); + } + + exit(0); +} + +void +server(void* cookie) +{ + int pid; + state_t* state = (state_t*)cookie; + + pid = getpid(); + state->pid = 0; + + if (state->fid_f == open_file) { + /* Create a temporary file for clients to open */ + sprintf(state->fname, "lat_selectXXXXXX"); + state->fid = mkstemp(state->fname); + if (state->fid <= 0) { + char buf[L_tmpnam+128]; + sprintf(buf, "lat_select: Could not create temp file %s", state->fname); + perror(buf); + exit(1); + } + close(state->fid); + return; + } + + /* Create a socket for clients to connect to */ + state->sock = tcp_server(TCP_SELECT, SOCKOPT_REUSE); + if (state->sock <= 0) { + perror("lat_select: Could not open tcp server socket"); + exit(1); + } + + /* Start a server process to accept client connections */ + switch(state->pid = fork()) { + case 0: + /* child server process */ + while (pid == getppid()) { + int newsock = tcp_accept(state->sock, SOCKOPT_NONE); + read(newsock, &state->fid, 1); + close(newsock); + } + exit(0); + case -1: + /* error */ + perror("lat_select::server(): fork() failed"); + exit(1); + default: + break; + } +} + +int +open_socket(void* cookie) +{ + return tcp_connect("localhost", TCP_SELECT, SOCKOPT_NONE); +} + +int +open_file(void* cookie) +{ + state_t* state = (state_t*)cookie; + + return open(state->fname, O_RDONLY); +} + +void +doit(iter_t iterations, void * cookie) +{ + state_t * state = (state_t *)cookie; + fd_set nosave; + static struct timeval tv; + static count = 0; + + tv.tv_sec = 0; + tv.tv_usec = 0; + + while (iterations-- > 0) { + nosave = state->set; + select(state->num, 0, &nosave, 0, &tv); + } +} + +void +initialize(iter_t iterations, void *cookie) +{ + char c; + state_t * state = (state_t *)cookie; + int n, last = 0 /* lint */; + int N = state->num, fid, fd; + + if (iterations) return; + + fid = (*state->fid_f)(cookie); + if (fid <= 0) { + perror("Could not open device"); + exit(1); 
+ } + state->max = 0; + FD_ZERO(&(state->set)); + for (n = 0; n < N; n++) { + fd = dup(fid); + if (fd == -1) break; + if (fd > state->max) + state->max = fd; + FD_SET(fd, &(state->set)); + } + state->max++; + close(fid); + if (n != N) + exit(1); +} + +void +cleanup(iter_t iterations, void *cookie) +{ + int i; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + for (i = 0; i <= state->max; ++i) { + if (FD_ISSET(i, &(state->set))) + close(i); + } + FD_ZERO(&(state->set)); +} + + diff --git a/performance/lmbench3/src/lat_sem.c b/performance/lmbench3/src/lat_sem.c new file mode 100644 index 0000000..fac0d81 --- /dev/null +++ b/performance/lmbench3/src/lat_sem.c @@ -0,0 +1,162 @@ +/* + * lat_sem.c - semaphore test + * + * usage: lat_sem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <sys/sem.h> + +void initialize(iter_t iterations, void *cookie); +void cleanup(iter_t iterations, void *cookie); +void doit(iter_t iterations, void *cookie); +void writer(int sid); + +typedef struct _state { + int pid; + int semid; +} state_t; + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + state.pid = 0; + + benchmp(initialize, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + micro("Semaphore latency", get_n() * 2); + return (0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + char c; + state_t * state = (state_t *)cookie; + + if (iterations) return; + + state->semid = semget(IPC_PRIVATE, 2, IPC_CREAT | IPC_EXCL | 0600); + semctl(state->semid, 0, SETVAL, 0); + semctl(state->semid, 1, SETVAL, 0); + + handle_scheduler(benchmp_childid(), 0, 1); + switch (state->pid = fork()) { + case 0: + signal(SIGTERM, exit); + handle_scheduler(benchmp_childid(), 1, 1); + writer(state->semid); + return; + + case -1: + perror("fork"); + return; + + default: + break; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t * state = (state_t *)cookie; + + if (iterations) return; + + if (state->pid) { + kill(state->pid, SIGKILL); + waitpid(state->pid, NULL, 0); + state->pid = 0; + } + /* free the semaphores */ + semctl(state->semid, 0, IPC_RMID); +} + +void +doit(register iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + struct 
sembuf sop[2]; + + sop[0].sem_num = 1; + sop[0].sem_op = -1; + sop[0].sem_flg = 0; + + sop[1].sem_num = 0; + sop[1].sem_op = 1; + sop[1].sem_flg = 0; + + while (iterations-- > 0) { + if (semop(state->semid, sop, 2) < 0) { + perror("(r) error on semaphore"); + exit(1); + } + } +} + +void +writer(register int sid) +{ + struct sembuf sop[2]; + + sop[0].sem_num = 1; + sop[0].sem_op = 1; + sop[0].sem_flg = 0; + + if (semop(sid, sop, 1) < 0) { + perror("(w) error on initial semaphore"); + exit(1); + } + + sop[0].sem_num = 0; + sop[0].sem_op = -1; + sop[0].sem_flg = 0; + + sop[1].sem_num = 1; + sop[1].sem_op = 1; + sop[1].sem_flg = 0; + + for ( ;; ) { + if (semop(sid, sop, 2) < 0) { + perror("(w) error on semaphore"); + exit(1); + } + } +} diff --git a/performance/lmbench3/src/lat_sig.c b/performance/lmbench3/src/lat_sig.c new file mode 100644 index 0000000..46aef0e --- /dev/null +++ b/performance/lmbench3/src/lat_sig.c @@ -0,0 +1,213 @@ +/* + * lat_sig.c - signal handler test + * + * XXX - this benchmark requires the POSIX sigaction interface. The reason + * for that is that the signal handler stays installed with that interface. + * The more portable signal() interface may or may not stay installed and + * reinstalling it each time is expensive. + * + * XXX - should really do a two process version. + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <setjmp.h> + +uint64 caught, n; +double adj; +void handler() { } +jmp_buf prot_env; + +void +do_send(iter_t iterations, void* cookie) +{ + int me = getpid(); + + while (--iterations > 0) { + kill(me, 0); + } +} + +void +do_install(iter_t iterations, void* cookie) +{ + struct sigaction sa, old; + + while (iterations-- > 0) { + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGUSR1, &sa, &old); + } +} + +void +do_catch(iter_t iterations, void* cookie) +{ + int me = getpid(); + struct sigaction sa, old; + double u; + + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGUSR1, &sa, &old); + + while (--iterations > 0) { + kill(me, SIGUSR1); + } +} + +struct _state { + char* fname; + char* where; +}; + +void +prot() { + if (++caught == n) { + caught = 0; + n = benchmp_interval(benchmp_getstate()); + } +} + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + int fd; + struct sigaction sa; + + if (iterations) return; + + fd = open(state->fname, 0); + state->where = mmap(0, 4096, PROT_READ, MAP_SHARED, fd, 0); + if ((long)state->where == -1) { + perror("mmap"); + exit(1); + } + + sa.sa_handler = prot; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGSEGV, &sa, 0); + sigaction(SIGBUS, &sa, 0); +} + +void +do_prot(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + n = iterations; + caught = 0; + + /* start the first timing interval */ + start(0); + + /* trigger the page fault, causing us to jump to prot() */ + *state->where = 1; +} + +/* + * Cost of catching the signal less the cost of sending it + */ +void +bench_catch(int parallel, int warmup, int repetitions) +{ + uint64 t, send_usecs, send_n; + + /* measure cost of sending signal */ + benchmp(NULL, do_send, NULL, 0, parallel, + warmup, repetitions, NULL); + send_usecs = gettime(); + send_n = 
get_n(); + + /* measure cost of sending & catching signal */ + benchmp(NULL, do_catch, NULL, 0, parallel, + warmup, repetitions, NULL); + + /* subtract cost of sending signal */ + if (gettime() > (send_usecs * get_n()) / send_n) { + settime(gettime() - (send_usecs * get_n()) / send_n); + } else { + settime(0); + } +} + +void +bench_prot(char* fname, int parallel, int warmup, int repetitions) +{ + uint64 catch_usecs, catch_n; + struct _state state; + + state.fname = fname; + + /* + * Catch protection faults. + * Assume that they will cost the same as a normal catch. + */ + bench_catch(parallel, warmup, repetitions); + catch_usecs = gettime(); + catch_n = get_n(); + + benchmp(initialize, do_prot, NULL, 0, parallel, + warmup, repetitions, &state); + if (gettime() > (catch_usecs * get_n()) / catch_n) { + settime(gettime() - (catch_usecs * get_n()) / catch_n); + } else { + settime(0); + } +} + + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] install|catch|prot [file]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 && optind != ac - 2) { + lmbench_usage(ac, av, usage); + } + + if (!strcmp("install", av[optind])) { + benchmp(NULL, do_install, NULL, 0, parallel, + warmup, repetitions, NULL); + micro("Signal handler installation", get_n()); + } else if (!strcmp("catch", av[optind])) { + bench_catch(parallel, warmup, repetitions); + micro("Signal handler overhead", get_n()); + } else if (!strcmp("prot", av[optind]) && optind == ac - 2) { + bench_prot(av[optind+1], parallel, warmup, repetitions); + micro("Protection fault", get_n()); + } else { + 
lmbench_usage(ac, av, usage); + } + return(0); +} diff --git a/performance/lmbench3/src/lat_syscall.c b/performance/lmbench3/src/lat_syscall.c new file mode 100644 index 0000000..9f30622 --- /dev/null +++ b/performance/lmbench3/src/lat_syscall.c @@ -0,0 +1,175 @@ +/* + * lat_syscall.c - time simple system calls + * + * Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + */ +char *id = "$Id: s.lat_syscall.c 1.11 97/06/15 22:38:58-07:00 lm $\n"; + +#include "bench.h" +#define FNAME "/usr/include/sys/types.h" + +struct _state { + int fd; + char* file; +}; + +void +do_getppid(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + getppid(); + } +} + +void +do_write(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + if (write(pState->fd, &c, 1) != 1) { + perror("/dev/null"); + return; + } + } +} + +void +do_read(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + char c; + + while (iterations-- > 0) { + if (read(pState->fd, &c, 1) != 1) { + perror("/dev/zero"); + return; + } + } +} + +void +do_stat(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + struct stat sbuf; + + while (iterations-- > 0) { + if (stat(pState->file, &sbuf) == -1) { + perror(pState->file); + return; + } + } +} + +void +do_fstat(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + struct stat sbuf; + + while (iterations-- > 0) { + if (fstat(pState->fd, &sbuf) == -1) { + perror("fstat"); + return; + } + } +} + +void +do_openclose(iter_t iterations, void *cookie) +{ + struct _state *pState = (struct _state*)cookie; + int fd; + + while (iterations-- > 0) { + fd = 
open(pState->file, 0); + if (fd == -1) { + perror(pState->file); + return; + } + close(fd); + } +} + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + struct _state state; + char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] null|read|write|stat|fstat|open [file]\n"; + + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1 && optind != ac - 2 ) { + lmbench_usage(ac, av, usage); + } + + state.file = FNAME; + if (optind == ac - 2) + state.file = av[optind + 1]; + + if (!strcmp("null", av[optind])) { + benchmp(NULL, do_getppid, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple syscall", get_n()); + } else if (!strcmp("write", av[optind])) { + state.fd = open("/dev/null", 1); + benchmp(NULL, do_write, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple write", get_n()); + close(state.fd); + } else if (!strcmp("read", av[optind])) { + state.fd = open("/dev/zero", 0); + if (state.fd == -1) { + fprintf(stderr, "Simple read: -1\n"); + return(1); + } + benchmp(NULL, do_read, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple read", get_n()); + close(state.fd); + } else if (!strcmp("stat", av[optind])) { + benchmp(NULL, do_stat, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple stat", get_n()); + } else if (!strcmp("fstat", av[optind])) { + state.fd = open(state.file, 0); + benchmp(NULL, do_fstat, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple fstat", get_n()); + close(state.fd); + } else if (!strcmp("open", av[optind])) { + benchmp(NULL, do_openclose, NULL, 0, parallel, + warmup, repetitions, &state); + micro("Simple open/close", 
get_n()); + } else { + lmbench_usage(ac, av, usage); + } + return(0); +} diff --git a/performance/lmbench3/src/lat_tcp.c b/performance/lmbench3/src/lat_tcp.c new file mode 100644 index 0000000..cf4d145 --- /dev/null +++ b/performance/lmbench3/src/lat_tcp.c @@ -0,0 +1,175 @@ +/* + * lat_tcp.c - simple TCP transaction latency test + * + * Three programs in one - + * server usage: tcp_xact -s + * client usage: tcp_xact [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname + * shutdown: tcp_xact -S hostname + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +typedef struct _state { + int msize; + int sock; + char *server; + char *buf; +} state_t; + +void init(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void doclient(iter_t iterations, void* cookie); +void server_main(); +void doserver(int sock); + +int +main(int ac, char **av) +{ + state_t state; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char buf[256]; + char *usage = "-s\n OR [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S server\n"; + + state.msize = 1; + + while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + state.sock = tcp_connect(optarg, + TCP_XACT, + SOCKOPT_NONE); + close(state.sock); + exit(0); + case 'm': + state.msize = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = 
atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + benchmp(init, doclient, cleanup, MEDIUM, parallel, + warmup, repetitions, &state); + + sprintf(buf, "TCP latency using %s", state.server); + micro(buf, get_n()); + + exit(0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + int msize = htonl(state->msize); + + if (iterations) return; + + state->sock = tcp_connect(state->server, TCP_XACT, SOCKOPT_NONE); + state->buf = malloc(state->msize); + + write(state->sock, &msize, sizeof(int)); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +doclient(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + int sock = state->sock; + + while (iterations-- > 0) { + write(sock, state->buf, state->msize); + read(sock, state->buf, state->msize); + } +} + +void +server_main() +{ + int newsock, sock; + + GO_AWAY; + signal(SIGCHLD, sigchld_wait_handler); + sock = tcp_server(TCP_XACT, SOCKOPT_REUSE); + for (;;) { + newsock = tcp_accept(sock, SOCKOPT_NONE); + switch (fork()) { + case -1: + perror("fork"); + break; + case 0: + doserver(newsock); + exit(0); + default: + close(newsock); + break; + } + } + /* NOTREACHED */ +} + +void +doserver(int sock) +{ + int n; + + if (read(sock, &n, sizeof(int)) == sizeof(int)) { + int msize = ntohl(n); + char* buf = (char*)malloc(msize); + + for (n = 0; read(sock, buf, msize) > 0; n++) { + write(sock, buf, msize); + } + free(buf); + } else { + /* + * A connection with no data means shut down. 
+ */ + tcp_done(TCP_XACT); + kill(getppid(), SIGTERM); + exit(0); + } +} diff --git a/performance/lmbench3/src/lat_udp.c b/performance/lmbench3/src/lat_udp.c new file mode 100644 index 0000000..cd4be23 --- /dev/null +++ b/performance/lmbench3/src/lat_udp.c @@ -0,0 +1,207 @@ +/* + * udp_xact.c - simple UDP transaction latency test + * + * Three programs in one - + * server usage: lat_udp -s + * client usage: lat_udp [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname + * shutdown: lat_udp -S hostname + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +char *id = "$Id$\n"; +#include "bench.h" + +#define MAX_MSIZE (10 * 1024 * 1024) + +void client_main(int ac, char **av); +void server_main(); +void timeout(); +void init(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); +void doit(iter_t iterations, void* cookie); + +typedef struct _state { + int sock; + int seq; + int msize; + char *server; + char *buf; +} state_t; + + +int +main(int ac, char **av) +{ + state_t state; + int c; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int server = 0; + int shutdown = 0; + int msize = 4; + char buf[256]; + char *usage = "-s\n OR [-S] [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n NOTE: message size must be >= 4\n"; + + if (sizeof(int) != 4) { + fprintf(stderr, "lat_udp: Wrong sequence size\n"); + return(1); + } + + while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { + switch(c) { + case 's': /* Server */ + if (fork() == 0) { + server_main(); + } + exit(0); + case 'S': /* shutdown serverhost */ + { + int seq, n; + int sock = udp_connect(optarg, + UDP_XACT, + SOCKOPT_NONE); + for (n = -1; n > -5; --n) { + seq = 
htonl(n); + (void) send(sock, &seq, sizeof(int), 0); + } + close(sock); + exit (0); + } + case 'm': + msize = atoi(optarg); + if (msize < sizeof(int)) { + lmbench_usage(ac, av, usage); + msize = 4; + } + if (msize > MAX_MSIZE) { + lmbench_usage(ac, av, usage); + msize = MAX_MSIZE; + } + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) + lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind + 1 != ac) { + lmbench_usage(ac, av, usage); + } + + state.server = av[optind]; + state.msize = msize; + benchmp(init, doit, cleanup, SHORT, parallel, + warmup, repetitions, &state); + sprintf(buf, "UDP latency using %s", state.server); + micro(buf, get_n()); + exit(0); +} + +void +init(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); + state->seq = 0; + state->buf = (char*)malloc(state->msize); + + signal(SIGALRM, timeout); + alarm(15); +} + +void +doit(iter_t iterations, void *cookie) +{ + state_t *state = (state_t *) cookie; + int seq = state->seq; + int net = htonl(seq); + int sock = state->sock; + int ret; + + alarm(15); + while (iterations-- > 0) { + *(int*)state->buf = htonl(seq++); + if (send(sock, state->buf, state->msize, 0) != state->msize) { + perror("lat_udp client: send failed"); + exit(5); + } + if (recv(sock, state->buf, state->msize, 0) != state->msize) { + perror("lat_udp client: recv failed"); + exit(5); + } + } + state->seq = seq; +} + +void +cleanup(iter_t iterations, void* cookie) +{ + state_t *state = (state_t *) cookie; + + if (iterations) return; + + close(state->sock); + free(state->buf); +} + +void +timeout() +{ + fprintf(stderr, "Recv timed out\n"); + exit(1); +} + +void +server_main() +{ + char *buf = (char*)valloc(MAX_MSIZE); + int sock, sent, namelen, seq 
= 0; + struct sockaddr_in it; + + GO_AWAY; + + sock = udp_server(UDP_XACT, SOCKOPT_REUSE); + + while (1) { + int nbytes; + namelen = sizeof(it); + if ((nbytes = recvfrom(sock, (void*)buf, MAX_MSIZE, 0, + (struct sockaddr*)&it, &namelen)) < 0) { + fprintf(stderr, "lat_udp server: recvfrom: got wrong size\n"); + exit(9); + } + sent = ntohl(*(int*)buf); + if (sent < 0) { + udp_done(UDP_XACT); + exit(0); + } + if (sent != ++seq) { + seq = sent; + } + *(int*)buf = htonl(seq); + if (sendto(sock, (void*)buf, nbytes, 0, + (struct sockaddr*)&it, sizeof(it)) < 0) { + perror("lat_udp sendto"); + exit(9); + } + } +} diff --git a/performance/lmbench3/src/lat_unix.c b/performance/lmbench3/src/lat_unix.c new file mode 100644 index 0000000..1e321f8 --- /dev/null +++ b/performance/lmbench3/src/lat_unix.c @@ -0,0 +1,130 @@ +/* + * tcp_unix.c - simple UNIX socket transaction latency test + * + * lat_unix [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 1994-2000 Carl Staelin and Larry McVoy. + * Distributed under the FSF GPL with additional restriction that + * results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +struct _state { + int sv[2]; + int pid; + int msize; + char* buf; +}; +void initialize(iter_t iterations, void* cookie); +void benchmark(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +int +main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + struct _state state; + int c; + char* usage = "[-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.msize = 1; + state.pid = 0; + + while (( c = getopt(ac, av, "m:P:W:N:")) != EOF) { + switch(c) { + case 'm': + state.msize = atoi(optarg); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind < ac) { + lmbench_usage(ac, av, usage); + } + + benchmp(initialize, benchmark, cleanup, 0, parallel, + warmup, repetitions, &state); + + micro("AF_UNIX sock stream latency", get_n()); + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + void exit(); + + if (iterations) return; + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pState->sv) == -1) { + perror("socketpair"); + } + + pState->buf = malloc(pState->msize); + if (pState->buf == NULL) { + fprintf(stderr, "buffer allocation\n"); + exit(1); + } + handle_scheduler(benchmp_childid(), 0, 1); + + if (pState->pid = fork()) + return; + + handle_scheduler(benchmp_childid(), 1, 1); + + /* Child sits and ping-pongs packets back to parent */ + signal(SIGTERM, exit); + while (read(pState->sv[0], pState->buf, pState->msize) == pState->msize) { + write(pState->sv[0], pState->buf, pState->msize); + } + exit(0); +} + +void +benchmark(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + + while (iterations-- > 0) { + if 
(write(pState->sv[1], pState->buf, pState->msize) != pState->msize + || read(pState->sv[1], pState->buf, pState->msize) != pState->msize) { + /* error handling: how do we signal failure? */ + cleanup(0, cookie); + exit(0); + } + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* pState = (struct _state*)cookie; + + if (iterations) return; + + if (pState->pid) { + kill(pState->pid, SIGKILL); + waitpid(pState->pid, NULL, 0); + pState->pid = 0; + } +} + diff --git a/performance/lmbench3/src/lat_unix_connect.c b/performance/lmbench3/src/lat_unix_connect.c new file mode 100644 index 0000000..46e1876 --- /dev/null +++ b/performance/lmbench3/src/lat_unix_connect.c @@ -0,0 +1,102 @@ +/* + * lat_unix_connect.c - simple UNIX connection latency test + * + * Three programs in one - + * server usage: lat_connect -s + * client usage: lat_connect [-P <parallelism>] [-W <warmup>] [-N <repetitions>] + * shutdown: lat_connect -q + * + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; +#include "bench.h" + +#define CONNAME "/tmp/af_unix" + +void server_main(void); + +void benchmark(iter_t iterations, void* cookie) +{ + while (iterations-- > 0) { + int sock = unix_connect(CONNAME); + if (sock <= 0) + printf("error on iteration %lu\n",iterations); + close(sock); + } +} + +int main(int ac, char **av) +{ + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + char *usage = "-s\n OR [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n OR -S\n"; + int c; + + /* Start the server "-s" or Shut down the server "-S" */ + if (ac == 2) { + if (!strcmp(av[1], "-s")) { + if (fork() == 0) { + server_main(); + } + exit(0); + } + if (!strcmp(av[1], "-S")) { + int sock = unix_connect(CONNAME); + write(sock, "0", 1); + close(sock); + exit(0); + } + } + + /* + * Rest is client + */ + while (( c = getopt(ac, av, "P:W:N:")) != EOF) { + switch(c) { + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (optind != ac) { + lmbench_usage(ac, av, usage); + } + + benchmp(NULL, benchmark, NULL, 0, parallel, warmup, repetitions, NULL); + micro("UNIX connection cost", get_n()); +} + +void server_main(void) +{ + int newsock, sock; + char c; + + GO_AWAY; + sock = unix_server(CONNAME); + for (;;) { + newsock = unix_accept(sock); + c = 0; + read(newsock, &c, 1); + if (c && c == '0') { + unix_done(sock, CONNAME); + exit(0); + } + close(newsock); + } +} diff --git a/performance/lmbench3/src/lat_usleep.c b/performance/lmbench3/src/lat_usleep.c new file mode 100755 index 0000000..d18d0c8 --- /dev/null +++ b/performance/lmbench3/src/lat_usleep.c @@ -0,0 +1,259 @@ +/* + * lat_usleep.c - usleep duration/latency + * + * The APIs for usleep(3), nanosleep(2), select(2), pselect(2), + * getitimer(2) and setitimer(2) support resolutions down to + * a 
micro-second. However, many implementations do not support + * such resolution. Most current implementations (as of Fall + * 2002) simply put the process back on the run queue and the + * process may get run on the next scheduler time slice (10-20 + * milli-second resolution). + * + * This benchmark measures the true latency from the timer/sleep + * call to the resumption of program execution. If the timers + * actually worked properly, then the latency would be identical + * to the requested duration, or a little longer, so the input + * and output figures would be nearly identical. In most current + * implementations the value is rounded up to the next scheduler + * timeslice (e.g., a resolution of 20 milli-seconds, with all + * values rounded up). + * + * usage: lat_usleep [-u | -i] [-P <parallelism>] [-W <warmup>] \ + * [-N <repetitions>] usecs + * + * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <sched.h> + +typedef enum {USLEEP, NANOSLEEP, SELECT, PSELECT, ITIMER} timer_e; + +uint64 caught, + n; +struct itimerval value; + +typedef struct _state { + unsigned long usecs; +} state_t; + +void +bench_usleep(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + + while (iterations-- > 0) { + usleep(state->usecs); + } +} + +void +bench_nanosleep(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timespec req; + struct timespec rem; + + req.tv_sec = 0; + req.tv_nsec = state->usecs * 1000; + + while (iterations-- > 0) { + if (nanosleep(&req, &rem) < 0) { + while (nanosleep(&rem, &rem) < 0) + ; + } + } +} + +void +bench_select(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timeval tv; + + while (iterations-- > 0) { + tv.tv_sec = 0; + tv.tv_usec = state->usecs; + select(0, NULL, NULL, NULL, &tv); + } +} + +#ifdef _POSIX_SELECT +void +bench_pselect(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct timespec ts; + + while (iterations-- > 0) { + ts.tv_sec = 0; + ts.tv_nsec = state->usecs * 1000; + pselect(0, NULL, NULL, NULL, &ts, NULL); + } +} +#endif /* _POSIX_SELECT */ + +void +interval() +{ + if (++caught == n) { + caught = 0; + n = benchmp_interval(benchmp_getstate()); + } + + setitimer(ITIMER_REAL, &value, NULL); +} + +void +initialize(iter_t iterations, void *cookie) +{ + state_t *state = (state_t*)cookie; + struct sigaction sa; + + if (iterations) return; + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = state->usecs; + value.it_value.tv_sec = 0; + value.it_value.tv_usec = state->usecs; + + sa.sa_handler = interval; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); +} + +void +bench_itimer(iter_t iterations, void *cookie) +{ + n = iterations; + caught = 0; + + /* + * start the first timing interval + */ + start(0); + + /* + * create the first 
timer, causing us to jump to interval() + */ + setitimer(ITIMER_REAL, &value, NULL); + + while (1) { + sleep(100000); + } +} + +int +set_realtime() +{ + struct sched_param sp; + + sp.sched_priority = sched_get_priority_max(SCHED_RR); + if (sched_setscheduler(0, SCHED_RR, &sp) >= 0) return TRUE; + perror("sched_setscheduler"); + return FALSE; +} + +int +main(int ac, char **av) +{ + int realtime = 0; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + char buf[512]; + timer_e what = USLEEP; + state_t state; + char *scheduler = ""; + char *mechanism = "usleep"; + char *usage = "[-r] [-u <method>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] usecs\nmethod=usleep|nanosleep|select|pselect|itimer\n"; + + realtime = 0; + + while ((c = getopt(ac, av, "ru:W:N:")) != EOF) { + switch (c) { + case 'r': + realtime = 1; + break; + case 'u': + if (strcmp(optarg, "usleep") == 0) { + what = USLEEP; + mechanism = "usleep"; + } else if (strcmp(optarg, "nanosleep") == 0) { + what = NANOSLEEP; + mechanism = "nanosleep"; + } else if (strcmp(optarg, "select") == 0) { + what = SELECT; + mechanism = "select"; +#ifdef _POSIX_SELECT + } else if (strcmp(optarg, "pselect") == 0) { + what = PSELECT; + mechanism = "pselect"; +#endif /* _POSIX_SELECT */ + } else if (strcmp(optarg, "itimer") == 0) { + what = ITIMER; + mechanism = "itimer"; + } else { + lmbench_usage(ac, av, usage); + } + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + if (optind != ac - 1) { + lmbench_usage(ac, av, usage); + } + + state.usecs = bytes(av[optind]); + if (realtime && set_realtime()) scheduler = "realtime "; + + switch (what) { + case USLEEP: + benchmp(NULL, bench_usleep, NULL, + 0, parallel, warmup, repetitions, &state); + break; + case NANOSLEEP: + benchmp(NULL, 
bench_nanosleep, NULL, + 0, parallel, warmup, repetitions, &state); + break; + case SELECT: + benchmp(NULL, bench_select, NULL, + 0, parallel, warmup, repetitions, &state); + break; +#ifdef _POSIX_SELECT + case PSELECT: + benchmp(NULL, bench_pselect, NULL, + 0, parallel, warmup, repetitions, &state); + break; +#endif /* _POSIX_SELECT */ + case ITIMER: + benchmp(initialize, bench_itimer, NULL, + 0, parallel, warmup, repetitions, &state); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + sprintf(buf, "%s%s %lu microseconds", scheduler, mechanism, state.usecs); + micro(buf, get_n()); + return (0); +} diff --git a/performance/lmbench3/src/lib_debug.c b/performance/lmbench3/src/lib_debug.c new file mode 100644 index 0000000..2be1852 --- /dev/null +++ b/performance/lmbench3/src/lib_debug.c @@ -0,0 +1,131 @@ +#include <math.h> +#include "bench.h" +#include "lib_debug.h" + +/* + * return micro-seconds / iteration at the the fraction point. + * + * some examples: + * min = percent_point(values, size, 0.0) + * 1st quartile = percent_point(values, size, 0.25) + * median = percent_point(values, size, 0.5) + * 3rd quartile = percent_point(values, size, 0.75) + * max = percent_point(values, size, 1.0) + * + * the data points in the results structure are sorted from + * largest to smallest, so we adjust the fraction accordingly. 
+ */ +double +percent_point(double fraction) +{ + double t, r; + result_t* results = get_results(); + + t = (1.0 - fraction) * (results->N - 1); + if (t == floor(t)) { + /* no interpolation */ + r = results->v[(int)t].u / (double)results->v[(int)t].n; + } else { + /* percent point falls between two points, interpolate */ + r = results->v[(int)t].u / (double)results->v[(int)t].n; + r += results->v[(int)t+1].u / (double)results->v[(int)t+1].n; + r /= 2.0; + } + + return r; +} + +void +print_results(int details) +{ + int i; + result_t* results = get_results(); + + fprintf(stderr, "N=%d, t={", results->N); + for (i = 0; i < results->N; ++i) { + fprintf(stderr, "%.2f", (double)results->v[i].u/results->v[i].n); + if (i < results->N - 1) + fprintf(stderr, ", "); + } + fprintf(stderr, "}\n"); + if (details) { + fprintf(stderr, "\t/* {", results->N); + for (i = 0; i < results->N; ++i) { + fprintf(stderr, + "%llu/%llu", results->v[i].u, results->v[i].n); + if (i < results->N - 1) + fprintf(stderr, ", "); + } + fprintf(stderr, "} */\n"); + } + +} + +/* + * Prints bandwidth (MB/s) quartile information + * + * bytes - bytes per iteration + */ +void +bw_quartile(uint64 bytes) +{ + double b = (double)bytes; + + fprintf(stderr, "%d\t%e\t%e\t%e\t%e\t%e\n", get_n(), + (double)bytes / (1000000. * percent_point(0.00)), + (double)bytes / (1000000. * percent_point(0.25)), + (double)bytes / (1000000. * percent_point(0.50)), + (double)bytes / (1000000. * percent_point(0.75)), + (double)bytes / (1000000. * percent_point(1.00))); +} + +/* + * Prints latency (nano-seconds) quartile information + * + * n - number of operations per iteration + */ +void +nano_quartile(uint64 n) +{ + fprintf(stderr, "%d\t%e\t%e\t%e\t%e\t%e\n", get_n(), + percent_point(0.00) * 1000. / (double)n, + percent_point(0.25) * 1000. / (double)n, + percent_point(0.50) * 1000. / (double)n, + percent_point(0.75) * 1000. / (double)n, + percent_point(1.00) * 1000. 
/ (double)n); +} + +/* + * print the page|line|word offset for each link in the pointer chain. + */ +void +print_mem(char* addr, size_t size, size_t line) +{ + char* p; + uint64 base, off; + size_t pagesize = getpagesize(); + + base = (uint64)addr; + for (p = addr; *(char**)p != addr; p = *(char**)p) { + off = (uint64)p - base; + fprintf(stderr, "\t%lu\t%lu\t%lu\n", off / pagesize, + (off % pagesize) / line, (off % line) / sizeof(char*)); + } +} + +void +check_mem(char* addr, size_t size) +{ + char* p; + size_t i; + size_t max = size / sizeof(char*) + 1; + + for (p=addr, i=0; *(char**)p != addr && i < max; p = *(char**)p, i++) { + if (p < addr || addr + size <= p) { + fprintf(stderr, "check_mem: pointer out of range!\n"); + } + } + if (*(char**)p != addr) { + fprintf(stderr, "check_mem: pointer chain doesn't loop\n"); + } +} diff --git a/performance/lmbench3/src/lib_debug.h b/performance/lmbench3/src/lib_debug.h new file mode 100644 index 0000000..3e1b682 --- /dev/null +++ b/performance/lmbench3/src/lib_debug.h @@ -0,0 +1,10 @@ +#ifndef _LIB_DEBUG_H +#define _LIB_DEBUG_H + +void print_results(int details); +void bw_quartile(uint64 bytes); +void nano_quartile(uint64 n); +void print_mem(char* addr, size_t size, size_t line); +void check_mem(char* addr, size_t size); + +#endif /* _LIB_DEBUG_H */ diff --git a/performance/lmbench3/src/lib_mem.c b/performance/lmbench3/src/lib_mem.c new file mode 100644 index 0000000..3bdd4dc --- /dev/null +++ b/performance/lmbench3/src/lib_mem.c @@ -0,0 +1,699 @@ +/* + * lib_mem.c - library of routines used to analyze the memory hierarchy + * + * @(#)lib_mem.c 1.15 staelin@xxxxxxxxxxxxxxxxxxxxxxxx + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. + * Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ * Support for this development by Sun Microsystems is gratefully acknowledged.
+ */
+
+#include "bench.h"
+
+/* macro repeaters: expand their argument 5/10/50/100 times */
+#define	FIVE(m)		m m m m m
+#define	TEN(m)		FIVE(m) FIVE(m)
+#define	FIFTY(m)	TEN(m) TEN(m) TEN(m) TEN(m) TEN(m)
+#define	HUNDRED(m)	FIFTY(m) FIFTY(m)
+
+/* building blocks for the generated pointer-chasing benchmarks:
+ * DEREF follows one link of chain N; DECLARE/INIT/SAVE keep each
+ * chain's current position in a static so a rerun on the same
+ * buffer resumes where the last run stopped */
+#define DEREF(N)	p##N = (char**)*p##N;
+#define DECLARE(N)	static char **sp##N; register char **p##N;
+#define INIT(N)		p##N = (mem_benchmark_rerun && addr_save==state->addr) ? sp##N : (char**)state->p[N];
+#define SAVE(N)		sp##N = p##N;
+
+#define MEM_BENCHMARK_F(N) mem_benchmark_##N,
+/* table of the 16 generated benchmarks, indexed by parallelism-1 */
+benchmp_f mem_benchmarks[] = {REPEAT_15(MEM_BENCHMARK_F)};
+
+/* non-zero once any chain has been walked; cleared by mem_reset() */
+static int mem_benchmark_rerun = 0;
+
+/*
+ * Generator for mem_benchmark_<N>: walk N+1 independent pointer
+ * chains in parallel, one hundred links per chain per iteration.
+ */
+#define MEM_BENCHMARK_DEF(N,repeat,body)			\
+void								\
+mem_benchmark_##N(iter_t iterations, void *cookie)		\
+{								\
+	struct mem_state* state = (struct mem_state*)cookie;	\
+	static char *addr_save = NULL;				\
+	repeat(DECLARE);					\
+								\
+	repeat(INIT);						\
+	while (iterations-- > 0) {				\
+		HUNDRED(repeat(body));				\
+	}							\
+								\
+	repeat(SAVE);						\
+	addr_save = state->addr;				\
+	mem_benchmark_rerun = 1;				\
+}
+
+MEM_BENCHMARK_DEF(0, REPEAT_0, DEREF)
+MEM_BENCHMARK_DEF(1, REPEAT_1, DEREF)
+MEM_BENCHMARK_DEF(2, REPEAT_2, DEREF)
+MEM_BENCHMARK_DEF(3, REPEAT_3, DEREF)
+MEM_BENCHMARK_DEF(4, REPEAT_4, DEREF)
+MEM_BENCHMARK_DEF(5, REPEAT_5, DEREF)
+MEM_BENCHMARK_DEF(6, REPEAT_6, DEREF)
+MEM_BENCHMARK_DEF(7, REPEAT_7, DEREF)
+MEM_BENCHMARK_DEF(8, REPEAT_8, DEREF)
+MEM_BENCHMARK_DEF(9, REPEAT_9, DEREF)
+MEM_BENCHMARK_DEF(10, REPEAT_10, DEREF)
+MEM_BENCHMARK_DEF(11, REPEAT_11, DEREF)
+MEM_BENCHMARK_DEF(12, REPEAT_12, DEREF)
+MEM_BENCHMARK_DEF(13, REPEAT_13, DEREF)
+MEM_BENCHMARK_DEF(14, REPEAT_14, DEREF)
+MEM_BENCHMARK_DEF(15, REPEAT_15, DEREF)
+
+
+size_t*	words_initialize(size_t max, int scale);
+
+
+/* forget saved chain positions so the next run starts from state->p[] */
+void
+mem_reset()
+{
+	mem_benchmark_rerun = 0;
+}
+
+/*
+ * benchmp cleanup hook: free the buffer and the page/line/word
+ * permutation arrays.  No-op while iterations != 0 (benchmp calls
+ * cleanup hooks per-iteration as well as at the end).
+ */
+void
+mem_cleanup(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+
+	if (iterations) return;
+
+	if (state->addr) {
+		free(state->addr);
+		state->addr = NULL;
+	}
+	if (state->lines) {
+		free(state->lines);
+		state->lines = NULL;
+	}
+	if (state->pages) {
+		free(state->pages);
+		state->pages = NULL;
+	}
+	if (state->words) {
+		free(state->words);
+		state->words = NULL;
+	}
+}
+
+/*
+ * cleanup for tlb_initialize(): here state->addr is an array of
+ * per-page allocations (see tlb_initialize), so each page is freed
+ * individually before the array itself.
+ */
+void
+tlb_cleanup(iter_t iterations, void* cookie)
+{
+	size_t i;
+	struct mem_state* state = (struct mem_state*)cookie;
+	char **addr = (char**)state->addr;
+
+	if (iterations) return;
+
+	if (addr) {
+		for (i = 0; i < state->npages; ++i) {
+			if (addr[i]) free(addr[i]);
+		}
+		free(addr);
+		state->addr = NULL;
+	}
+	if (state->pages) {
+		free(state->pages);
+		state->pages = NULL;
+	}
+	if (state->lines) {
+		free(state->lines);
+		state->lines = NULL;
+	}
+}
+
+/*
+ * Common setup: allocate a buffer of maxlen (+2 pages for alignment),
+ * compute the words/lines/pages geometry from state->len, state->line
+ * and state->pagesize, and record a page-aligned base pointer.
+ * Leaves state->initialized = 0 on allocation failure.
+ *
+ * NOTE(review): permutation() is a project helper from bench.h —
+ * presumably returns a randomized array of page offsets; confirm.
+ */
+void
+base_initialize(iter_t iterations, void* cookie)
+{
+	int	nwords, nlines, nbytes, npages, nmpages;
+	size_t	*pages;
+	size_t	*lines;
+	size_t	*words;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	state->initialized = 0;
+
+	nbytes = state->len;
+	nwords = state->line / sizeof(char*);
+	nlines = state->pagesize / state->line;
+	npages = (nbytes + state->pagesize - 1) / state->pagesize;
+	nmpages= (state->maxlen + state->pagesize - 1) / state->pagesize;
+
+	srand(getpid());
+
+	words = NULL;
+	lines = NULL;
+	pages = permutation(nmpages, state->pagesize);
+	p = state->addr = (char*)malloc(state->maxlen + 2 * state->pagesize);
+
+	state->nwords = nwords;
+	state->nlines = nlines;
+	state->npages = npages;
+	state->lines = lines;
+	state->pages = pages;
+	state->words = words;
+
+	if (state->addr == NULL || pages == NULL)
+		return;
+
+	/* round the buffer start up to a page boundary */
+	if ((unsigned long)p % state->pagesize) {
+		p += state->pagesize - (unsigned long)p % state->pagesize;
+	}
+	state->base = p;
+	state->initialized = 1;
+	mem_reset();
+}
+
+/*
+ * Create a circular list of pointers using a simple striding
+ * algorithm.
+ *
+ * This access pattern corresponds to many array/matrix
+ * algorithms.  It should be easily and correctly predicted
+ * by any decent hardware prefetch algorithm.
+ */
+void
+stride_initialize(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+	size_t	i;
+	size_t	range = state->len;
+	size_t	stride = state->line;
+	char*	addr;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	addr = state->base;
+
+	/* each element points `stride' bytes ahead; last wraps to 0 */
+	for (i = stride; i < range; i += stride) {
+		*(char **)&addr[i - stride] = (char*)&addr[i];
+	}
+	*(char **)&addr[i - stride] = (char*)&addr[0];
+	state->p[0] = addr;
+	mem_reset();
+}
+
+/*
+ * Build a worst-case (cache- and TLB-thrashing) pointer chain;
+ * see the comment inside for the access pattern rationale.
+ */
+void
+thrash_initialize(iter_t iterations, void* cookie)
+{
+	struct mem_state* state = (struct mem_state*)cookie;
+	size_t	i;
+	size_t	j;
+	size_t	cur;
+	size_t	next;
+	size_t	cpage;
+	size_t	npage;
+	char*	addr;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	addr = state->base;
+
+	/*
+	 * Create a circular list of pointers with a random access
+	 * pattern.
+	 *
+	 * This stream corresponds more closely to linked list
+	 * memory access patterns.  For large data structures each
+	 * access will likely cause both a cache miss and a TLB miss.
+	 *
+	 * Access a different page each time.  This will eventually
+	 * cause a tlb miss each page.  It will also cause maximal
+	 * thrashing in the cache between the user data stream and
+	 * the page table entries.
+	 */
+	if (state->len % state->pagesize) {
+		/* buffer is not a whole number of pages: just chain all
+		 * line-sized words in bit-reversed order */
+		state->nwords = state->len / state->line;
+		state->words = words_initialize(state->nwords, state->line);
+		for (i = 0; i < state->nwords - 1; ++i) {
+			*(char **)&addr[state->words[i]] = (char*)&addr[state->words[i+1]];
+		}
+		*(char **)&addr[state->words[i]] = addr;
+		state->p[0] = addr;
+	} else {
+		/* whole pages: chain across pages so consecutive links
+		 * always land on different (randomly permuted) pages */
+		state->nwords = state->pagesize / state->line;
+		state->words = words_initialize(state->nwords, state->line);
+
+		for (i = 0; i < state->npages - 1; ++i) {
+			cpage = state->pages[i];
+			npage = state->pages[i + 1];
+			for (j = 0; j < state->nwords; ++j) {
+				cur = cpage + state->words[(i + j) % state->nwords];
+				next = npage + state->words[(i + j + 1) % state->nwords];
+				*(char **)&addr[cur] = (char*)&addr[next];
+			}
+		}
+		/* close the cycle: last page links back to page 0 */
+		cpage = state->pages[i];
+		npage = state->pages[0];
+		for (j = 0; j < state->nwords; ++j) {
+			cur = cpage + state->words[(i + j) % state->nwords];
+			next = npage + state->words[(j + 1) % state->nwords];
+			*(char **)&addr[cur] = (char*)&addr[next];
+		}
+		state->p[0] = (char*)&addr[state->pages[0]];
+	}
+	mem_reset();
+}
+
+/*
+ * mem_initialize
+ *
+ * Create a circular pointer chain that runs through memory.
+ *
+ * The chain threads through each cache line on a page before
+ * moving to the next page.  The order of cache line accesses
+ * is randomized to defeat cache prefetching algorithms.  In
+ * addition, the order of page accesses is randomized.  Finally,
+ * to reduce the impact of incorrect line-size estimates on
+ * machines with direct-mapped caches, we randomize which
+ * word in the cache line is used to hold the pointer.
+ *
+ * It initializes state->width pointers to elements evenly
+ * spaced through the chain.
+ */
+void
+mem_initialize(iter_t iterations, void* cookie)
+{
+	int i, j, k, l, np, nw, nwords, nlines, nbytes, npages, npointers;
+	unsigned int r;
+	size_t	*pages;
+	size_t	*lines;
+	size_t	*words;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	state->initialized = 0;
+
+	npointers = state->len / state->line;
+	nwords = state->nwords;
+	nlines = state->nlines;
+	npages = state->npages;
+	words = state->words = words_initialize(nwords, sizeof(char*));
+	lines = state->lines = words_initialize(nlines, state->line);
+	pages = state->pages;
+	p = state->base;
+
+	if (state->addr == NULL \
+	    || pages == NULL || lines == NULL || words == NULL) {
+		return;
+	}
+
+	/* setup the run through the pages */
+	l = 0;
+	for (i = 0; i < npages; ++i) {
+		/* link every word of line j to the same word of line j+1 */
+		for (j = 0; j < nlines - 1 && l < npointers - 1; ++j, ++l) {
+			for (k = 0; k < state->line; k += sizeof(char*)) {
+				*(char**)(p + pages[i] + lines[j] + k) =
+					p + pages[i] + lines[j+1] + k;
+			}
+			/* record up to `width' evenly spaced chain entry
+			 * points in state->p[] for parallel walkers */
+			if (l % (npointers/state->width) == 0
+			    && l / (npointers/state->width) < MAX_MEM_PARALLELISM) {
+				k = l / (npointers/state->width);
+				state->p[k] = p + pages[i] + lines[j] + words[k % nwords];
+			}
+		}
+
+		/* last line of page i links to first line of page i+1 */
+		if (i < npages - 1) {
+			for (k = 0; k < nwords; ++k)
+				*(char**)(p + pages[i] + lines[j] + words[k]) =
+					p + pages[i+1] + lines[0] + words[k];
+		}
+	}
+	/* close the cycle, rotating to the next word slot */
+	for (k = 0; k < nwords; ++k) {
+		nw = (k == nwords - 1) ? 0 : k + 1;
+		*(char**)(p + pages[npages-1] + lines[j] + words[k]) =
+			p + pages[0] + lines[0] + words[nw];
+	}
+
+	/* now, run through the chain once to clear the cache */
+	mem_reset();
+	(*mem_benchmarks[state->width-1])((nwords * npointers + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * line_initialize
+ *
+ * This is very similar to mem_initialize, except that we always use
+ * the first element of the cache line to hold the pointer.
+ *
+ */
+void
+line_initialize(iter_t iterations, void* cookie)
+{
+	int	i, j, k, line, nlines, npages;
+	unsigned int r;
+	size_t	*pages;
+	size_t	*lines;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	base_initialize(iterations, cookie);
+	if (!state->initialized) return;
+	state->initialized = 0;
+
+	nlines = state->nlines;
+	npages = state->npages;
+	lines = state->lines = words_initialize(nlines, state->line);
+	pages = state->pages;
+	p = state->base;
+
+	state->width = 1;
+
+	if (state->addr == NULL || lines == NULL || pages == NULL)
+		return;
+
+	/* new setup runs through the lines */
+	for (i = 0; i < npages; ++i) {
+		/* sequence through the first word of each line */
+		for (j = 0; j < nlines - 1; ++j) {
+			*(char**)(p + pages[i] + lines[j]) =
+				p + pages[i] + lines[j+1];
+		}
+
+		/* jump to the first word of the first line on next page */
+		*(char**)(p + pages[i] + lines[j]) =
+			p + pages[(i < npages-1) ? i+1 : 0] + lines[0];
+	}
+	state->p[0] = p + pages[0] + lines[0];
+
+	/* now, run through the chain once to clear the cache */
+	mem_reset();
+	mem_benchmark_0((nlines * npages + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * tlb_initialize
+ *
+ * Build a pointer chain which accesses one word per page, for a total
+ * of (line * pages) bytes of data loaded into cache.
+ *
+ * If the number of elements in the chain (== #pages) is larger than the
+ * number of pages addressed by the TLB, then each access should cause
+ * a TLB miss (certainly as the number of pages becomes much larger than
+ * the TLB-addressed space).
+ *
+ * In addition, if we arrange the chain properly, each word we access
+ * will be in the cache.
+ *
+ * This means that the average access time for each pointer dereference
+ * should be a cache hit plus a TLB miss.
+ *
+ */
+void
+tlb_initialize(iter_t iterations, void* cookie)
+{
+	int	i, j, nwords, nlines, npages, pagesize;
+	unsigned int r;
+	char	**pages = NULL;
+	char	**addr = NULL;
+	size_t	*lines = NULL;
+	struct mem_state* state = (struct mem_state*)cookie;
+	register char *p = 0 /* lint */;
+
+	if (iterations) return;
+
+	state->initialized = 0;
+
+	pagesize = state->pagesize;
+	nwords = 0;
+	nlines = pagesize / sizeof(char*);
+	npages = state->len / pagesize;
+
+	srand(getpid() ^ (getppid()<<7));
+
+	lines = words_initialize(nlines, sizeof(char*));
+	pages = (char**)malloc(npages * sizeof(char**));
+	addr = (char**)malloc(npages * sizeof(char**));
+
+	/* note: unlike the other initializers, state->addr holds an
+	 * array of per-page pointers here; free via tlb_cleanup() */
+	state->nwords = 1;
+	state->nlines = nlines;
+	state->npages = npages;
+	state->words = NULL;
+	state->lines = lines;
+	state->pages = (size_t*)pages;
+	state->addr = (char*)addr;
+	if (addr) bzero(addr, npages * sizeof(char**));
+	if (pages) bzero(pages, npages * sizeof(char**));
+
+	if (addr == NULL || pages == NULL || lines == NULL) {
+		return;
+	}
+
+	/* first, layout the sequence of page accesses */
+	for (i = 0; i < npages; ++i) {
+		p = addr[i] = (char*)valloc(pagesize);
+		if (p == NULL) return;
+		/* fall back to over-allocation if valloc misaligned */
+		if ((unsigned long)p % pagesize) {
+			free(p);
+			p = addr[i] = (char*)valloc(2 * pagesize);
+			if (p == NULL) return;
+			p += pagesize - (unsigned long)p % pagesize;
+		}
+		pages[i] = (char*)p;
+	}
+
+	/* randomize the page sequences (except for zeroth page) */
+	r = (rand() << 15) ^ rand();
+	for (i = npages - 2; i > 0; --i) {
+		char* l;
+		r = (r << 1) ^ (rand() >> 4);
+		l = pages[(r % i) + 1];
+		pages[(r % i) + 1] = pages[i + 1];
+		pages[i + 1] = l;
+	}
+
+	/* now setup run through the pages */
+	for (i = 0; i < npages - 1; ++i) {
+		*(char**)(pages[i] + lines[i%nlines]) =
+			pages[i+1] + lines[(i+1)%nlines];
+	}
+	*(char**)(pages[i] + lines[i%nlines]) = pages[0] + lines[0];
+	state->p[0] = pages[0] + lines[0];
+
+	/* run through the chain once to clear the cache */
+	mem_reset();
+	mem_benchmark_0((npages + 100) / 100, state);
+
+	state->initialized = 1;
+}
+
+/*
+ * words_initialize
+ *
+ * This is supposed to create the order in which the words in a
+ * "cache line" are used.  Since we rarely know the cache line
+ * size with any real reliability, we need to jump around so
+ * as to maximize the number of potential cache misses, and to
+ * minimize the possibility of re-using a cache line.
+ */
+size_t*
+words_initialize(size_t max, int scale)
+{
+	size_t	i, j, nbits;
+	size_t*	words = (size_t*)malloc(max * sizeof(size_t));
+
+	if (!words) return NULL;
+
+	bzero(words, max * sizeof(size_t));
+	/* count the bits needed to index max entries */
+	for (i = max>>1, nbits = 0; i != 0; i >>= 1, nbits++)
+		;
+	for (i = 0; i < max; ++i) {
+		/* now reverse the bits */
+		for (j = 0; j < nbits; j++) {
+			if (i & (1<<j)) {
+				words[i] |= (1<<(nbits-j-1));
+			}
+		}
+		words[i] *= scale;
+	}
+	return words;
+}
+
+
+/*
+ * Estimate the cache line size: time a line-granular pointer chain
+ * at increasing strides and look for the latency jump, returning
+ * the stride just before timings level off again (0 if none found,
+ * (size_t)-1 if no buffer could be allocated).
+ */
+size_t
+line_find(size_t len, int warmup, int repetitions, struct mem_state* state)
+{
+	size_t	i, j, big_jump, line;
+	size_t	maxline = getpagesize() / 16;
+	double	baseline, t;
+
+	big_jump = 0;
+	line = 0;
+
+	/*
+	fprintf(stderr, "line_find(%d, ...): entering\n", len);
+	/**/
+
+	state->width = 1;
+	state->line = sizeof(char*);
+	/* halve len until the buffer allocation succeeds */
+	for (state->addr = NULL; !state->addr && len; ) {
+		state->len = state->maxlen = len;
+		line_initialize(0, state);
+		if (state->addr == NULL) len >>= 1;
+	}
+	if (state->addr == NULL) return -1;
+
+	for (i = sizeof(char*); i <= maxline; i<<=1) {
+		t = line_test(i, warmup, repetitions, state);
+
+		if (t == 0.) break;
+
+		/* a >30% jump followed by a return to <15% above the
+		 * previous timing brackets the line size */
+		if (i > sizeof(char*)) {
+			if (t > 1.3 * baseline) {
+				big_jump = 1;
+			} else if (big_jump && t < 1.15 * baseline) {
+				line = (i>>1);
+				break;
+			}
+		}
+		baseline = t;
+	}
+	mem_cleanup(0, state);
+	/*
+	fprintf(stderr, "line_find(%d, ...): returning %d\n", len, line);
+	/**/
+	return line;
+}
+
+/*
+ * Time the existing chain visiting only every `line' bytes
+ * (a subset of the configured lines per page); returns the
+ * median time (in lmbench time units) per ten dereferences.
+ * The full chain is restored before returning.
+ */
+double
+line_test(size_t line, int warmup, int repetitions, struct mem_state* state)
+{
+	size_t	i;
+	size_t	npages = state->npages;
+	size_t	nlines = state->pagesize / line;
+	double	t;
+	char*	p = state->base;
+	char*	first = p + state->pages[0] + state->lines[0];
+	char*	last = p + state->pages[npages-1] + state->lines[nlines-1];
+	result_t *r, *r_save;
+
+
+	/* only visit a subset of the lines in each page */
+	if (nlines < state->nlines) {
+		p = state->base;
+		for (i = 0; i < npages - 1; ++i) {
+			*(char**)(p + state->pages[i] + state->lines[nlines-1]) =
+				p + state->pages[i+1] + state->lines[0];
+		}
+		*(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) =
+			p + state->pages[0] + state->lines[0];
+	}
+
+	/* time the walk with a private result set, then restore the
+	 * caller's results */
+	r_save = get_results();
+	r = (result_t*)malloc(sizeof_result(repetitions));
+	insertinit(r);
+	p = first;
+	for (i = 0; i < repetitions; ++i) {
+		BENCH1(HUNDRED(p = *(char**)p;),0);
+		/*
+		fprintf(stderr, "%d\t%d\t%d\n", line, (int)gettime(), (int)get_n());
+		/**/
+		insertsort(gettime(), get_n(), r);
+	}
+	use_pointer(p);
+	set_results(r);
+	t = 10. * (double)gettime() / (double)get_n();
+	set_results(r_save);
+	free(r);
+
+	/*
+	fprintf(stderr, "%d\t%.5f\t%d\n", line, t, state->len);
+	/**/
+
+	/* fixup full path again */
+	if (nlines < state->nlines) {
+		p = state->base;
+		for (i = 0; i < npages - 1; ++i) {
+			*(char**)(p +
+				  state->pages[i] +
+				  state->lines[nlines-1]) =
+				p +
+				state->pages[i] +
+				state->lines[nlines];
+		}
+		*(char**)(p +
+			  state->pages[npages-1] +
+			  state->lines[nlines-1]) =
+			p +
+			state->pages[npages-1] +
+			state->lines[nlines];
+	}
+
+	return (t);
+}
+
+/*
+ * Measure available memory-access parallelism: walk 1..16 chains
+ * concurrently over the same buffer and report the best speedup
+ * relative to a single chain (>= 1.0; -1.0 on allocation failure).
+ */
+double
+par_mem(size_t len, int warmup, int repetitions, struct mem_state* state)
+{
+	int	i, j, k, n, __n;
+	double	baseline, max_par, par;
+
+	state->width = 1;
+	max_par = 1.;
+	__n = 1;
+
+	/* halve len until the buffer allocation succeeds */
+	for (state->addr = NULL; !state->addr && len; ) {
+		state->len = state->maxlen = len;
+		mem_initialize(0, state);
+		if (state->addr == NULL) len >>= 1;
+	}
+	if (state->addr == NULL) return -1.;
+
+	for (i = 0; i < MAX_MEM_PARALLELISM; ++i) {
+		n = len / state->line;
+		/* start the i+1 walkers at evenly spaced chain offsets */
+		for (j = 0; j <= i; j++) {
+			size_t nlines = len / state->line;
+			size_t lines_per_chunk = nlines / (i + 1);
+			size_t lines_per_page = state->pagesize / state->line;
+			size_t words_per_chunk = state->nwords / (i + 1);
+			size_t line = j * lines_per_chunk;
+			size_t word = (j * state->nwords) / (i + 1);
+
+			/*
+			if (state->len == 32768 && i == 7) {
+				fprintf(stderr, "\tj=%d, line=%d, word=%d, page=%d, _line=%d, _word=%d\n", j, line, word, line / lines_per_page, line % lines_per_page, word % state->nwords);
+			}
+			/**/
+			state->p[j] = state->base +
+				state->pages[line / lines_per_page] +
+				state->lines[line % lines_per_page] +
+				state->words[word % state->nwords];
+		}
+		mem_reset();
+		/* one warm-up pass, then the timed run */
+		(*mem_benchmarks[i])((len / sizeof(char*) + 100) / 100, state);
+		BENCH((*mem_benchmarks[i])(__n, state); __n = 1;, 0);
+		if (i == 0) {
+			baseline = (double)gettime() / (double)get_n();
+		} else if (gettime() > 0) {
+			par = baseline;
+			par /= (double)gettime() / (double)((i + 1) * get_n());
+			/*
+			fprintf(stderr, "par_mem(%d): i=%d, p=%5.2f, l=%d, lpp=%d, lpc=%d, nl=%d, wpc=%d\n", len, i, par, state->line, state->pagesize / state->line, (len / state->line) / (i + 1), len / state->line, state->nwords / (i + 1));
			/**/
+			if (par > max_par) {
+				max_par = par;
+			}
+		}
+	}
+	mem_cleanup(0, state);
+
+	return max_par;
+}
+
+
diff --git a/performance/lmbench3/src/lib_mem.h b/performance/lmbench3/src/lib_mem.h
new file mode 100644
index 0000000..5268515
--- /dev/null
+++ b/performance/lmbench3/src/lib_mem.h
@@ -0,0 +1,60 @@
+#ifndef LMBENCH_MEM_H
+#define LMBENCH_MEM_H
+
+
+#define MAX_MEM_PARALLELISM 16
+#define MEM_BENCHMARK_DECL(N) \
+	void mem_benchmark_##N(iter_t iterations, void* cookie);
+
+/* REPEAT_N(m) expands m(0) m(1) ... m(N) */
+#define REPEAT_0(m)	m(0)
+#define REPEAT_1(m)	REPEAT_0(m) m(1)
+#define REPEAT_2(m)	REPEAT_1(m) m(2)
+#define REPEAT_3(m)	REPEAT_2(m) m(3)
+#define REPEAT_4(m)	REPEAT_3(m) m(4)
+#define REPEAT_5(m)	REPEAT_4(m) m(5)
+#define REPEAT_6(m)	REPEAT_5(m) m(6)
+#define REPEAT_7(m)	REPEAT_6(m) m(7)
+#define REPEAT_8(m)	REPEAT_7(m) m(8)
+#define REPEAT_9(m)	REPEAT_8(m) m(9)
+#define REPEAT_10(m)	REPEAT_9(m) m(10)
+#define REPEAT_11(m)	REPEAT_10(m) m(11)
+#define REPEAT_12(m)	REPEAT_11(m) m(12)
+#define REPEAT_13(m)	REPEAT_12(m) m(13)
+#define REPEAT_14(m)	REPEAT_13(m) m(14)
+#define REPEAT_15(m)	REPEAT_14(m) m(15)
+
+struct mem_state {
+	char*	addr;	/* raw pointer returned by malloc */
+	char*	base;	/* page-aligned pointer */
+	char*	p[MAX_MEM_PARALLELISM];	/* chain entry points */
+	int	initialized;	/* set by *_initialize on success */
+	int	width;		/* number of parallel chains */
+	size_t	len;		/* working-set size in bytes */
+	size_t	maxlen;		/* allocation size in bytes */
+	size_t	line;		/* assumed cache-line size */
+	size_t	pagesize;
+	size_t	nlines;
+	size_t	npages;
+	size_t	nwords;
+	size_t*	pages;	/* permuted page offsets */
+	size_t*	lines;	/* permuted line offsets within a page */
+	size_t*	words;	/* permuted word offsets within a line */
+};
+
+void	stride_initialize(iter_t iterations, void* cookie);
+void	thrash_initialize(iter_t iterations, void* cookie);
+void	mem_initialize(iter_t iterations, void* cookie);
+void	line_initialize(iter_t iterations, void* cookie);
+void	tlb_initialize(iter_t iterations, void* cookie);
+void	mem_cleanup(iter_t iterations, void*
cookie); +void tlb_cleanup(iter_t iterations, void* cookie); + +REPEAT_15(MEM_BENCHMARK_DECL) +extern benchmp_f mem_benchmarks[]; + +size_t line_find(size_t l, int warmup, int repetitions, struct mem_state* state); +double line_test(size_t l, int warmup, int repetitions, struct mem_state* state); +double par_mem(size_t l, int warmup, int repetitions, struct mem_state* state); + +#endif /* LMBENCH_MEM_H */ + diff --git a/performance/lmbench3/src/lib_sched.c b/performance/lmbench3/src/lib_sched.c new file mode 100644 index 0000000..035925b --- /dev/null +++ b/performance/lmbench3/src/lib_sched.c @@ -0,0 +1,239 @@ +#include "bench.h" + +/* #define _DEBUG */ + +#if defined(HAVE_SYSMP) +#include <sys/sysmp.h> +#include <sys/sysinfo.h> +#endif + +#if defined(HAVE_MPCTL) +#include <sys/mpctl.h> +#endif + +#if defined(HAVE_BINDPROCESSOR) +#include <sys/processor.h> +#endif + +#if defined(HAVE_PROCESSOR_BIND) +#include <sys/types.h> +#include <sys/processor.h> +#include <sys/procset.h> +#endif + +#if defined(HAVE_SCHED_SETAFFINITY) +#include <sched.h> +#endif + +extern int custom(char* str, int cpu); +extern int reverse_bits(int cpu); +extern int sched_ncpus(); +extern int sched_pin(int cpu); + +/* + * The interface used by benchmp. + * + * childno is the "logical" child id number. + * In range [0, ..., parallel-1]. + * benchproc is the "logical" id within the benchmark process. The + * benchmp-created process is logical ID zero, child processes + * created by the benchmark range from [1, ..., nbenchprocs]. + * nbenchprocs is the number of child processes that each benchmark + * process will create. Most benchmarks will leave this zero, + * but some such as the pipe() benchmarks will not. + */ +int +handle_scheduler(int childno, int benchproc, int nbenchprocs) +{ + int cpu = 0; + char* sched = getenv("LMBENCH_SCHED"); + + if (!sched || strcasecmp(sched, "DEFAULT") == 0) { + /* do nothing. 
Allow scheduler to control placement */ + return 0; + } else if (strcasecmp(sched, "SINGLE") == 0) { + /* assign all processes to CPU 0 */ + cpu = 0; + } else if (strcasecmp(sched, "BALANCED") == 0) { + /* assign each benchmark process to its own processor, + * but child processes will share the CPU with the + * parent. + */ + cpu = childno; + } else if (strcasecmp(sched, "BALANCED_SPREAD") == 0) { + /* + * assign each benchmark process to its own processor, + * logically as far away from neighboring IDs as + * possible. This can help identify bus contention + * issues in SMPs with hierarchical busses or NUMA + * memory. + */ + cpu = reverse_bits(childno); + } else if (strcasecmp(sched, "UNIQUE") == 0) { + /* + * assign each benchmark process and each child process + * to its own processor. + */ + cpu = childno * (nbenchprocs + 1) + benchproc; + } else if (strcasecmp(sched, "UNIQUE_SPREAD") == 0) { + /* + * assign each benchmark process and each child process + * to its own processor, logically as far away from + * neighboring IDs as possible. This can help identify + * bus contention issues in SMPs with hierarchical busses + * or NUMA memory. + */ + cpu = reverse_bits(childno * (nbenchprocs + 1) + benchproc); + } else if (strncasecmp(sched, "CUSTOM ", strlen("CUSTOM ")) == 0) { + cpu = custom(sched + strlen("CUSTOM"), childno); + } else if (strncasecmp(sched, "CUSTOM_UNIQUE ", strlen("CUSTOM_UNIQUE ")) == 0) { + cpu = custom(sched + strlen("CUSTOM_UNIQUE"), + childno * (nbenchprocs + 1) + benchproc); + } else { + /* default action: do nothing */ + return; + } + + return sched_pin(cpu % sched_ncpus()); +} + +/* + * Use to get sequentially created processes "far" away from + * each other in an SMP. + * + * XXX: probably doesn't work for NCPUS not a power of two. 
+ */ +int +reverse_bits(int cpu) +{ + int i; + int nbits; + int max = sched_ncpus() - 1; + int cpu_reverse = 0; + + for (i = max>>1, nbits = 1; i > 0; i >>= 1, nbits++) + ; + /* now reverse the bits */ + for (i = 0; i < nbits; i++) { + if (cpu & (1<<i)) + cpu_reverse |= (1<<(nbits-i-1)); + } + return cpu_reverse; +} + +/* + * Custom is a user-defined sequence of CPU ids + */ +int +custom(char* str, int cpu) +{ + static int nvalues = -1; + static int* values = NULL; + + if (values == NULL) { + nvalues = 0; + values = (int*)malloc(sizeof(int)); + + while (*str) { + char* q; + while (*str && !isdigit(*str)) str++; + q = str; + while (*str && isdigit(*str)) str++; + if (str == q) break; + *str++ = 0; + sscanf(q, "%d", &values[nvalues++]); + values = (int*)realloc((void*)values, (nvalues + 1) * sizeof(int)); + } + } + if (nvalues == 0) return 0; + return values[cpu % nvalues]; +} + +/* + * Return the number of processors in this host + */ +int +sched_ncpus() +{ +#ifdef MP_NPROCS + /* SGI IRIX interface */ + return sysmp(MP_NPROCS); +#elif defined(HAVE_MPCTL) + /* HP-UX interface */ + return mpctl(MPC_GETNUMSPUS_SYS, 0, 0); +#elif defined(_SC_NPROCESSORS_ONLN) + /* AIX, Solaris, and Linux interface */ + return sysconf(_SC_NPROCESSORS_ONLN); +#endif + return 1; +} + +/* + * Pin the current process to the given CPU + * + * return 0 when successful + * returns -1 on error + */ +int +sched_pin(int cpu) +{ + int retval = -1; + +#ifdef HAVE_SYSMP + /* SGI IRIX interface */ + retval = sysmp(MP_MUSTRUN, cpu); +#elif defined(HAVE_MPCTL) + /* HP-UX interface */ + retval = mpctl(MPC_SET_PROCESS, cpu, MPC_SELFPID); +#elif defined(HAVE_BINDPROCESSOR) + /* AIX interface */ + retval = bindprocessor(BINDPROCESS, getpid(), cpu); +#elif defined(HAVE_PROCESSOR_BIND) + /* Solaris interface */ + retval = processor_bind(P_PID, P_MYPID, cpu, NULL); +#elif defined(HAVE_SCHED_SETAFFINITY) + /* Linux interface */ + static unsigned long* mask = NULL; + static unsigned long* cpumask = NULL; + 
static int sz = 0; + static int ncpus = 0; + int i; + int j; + + if (cpumask == NULL) { + sz = 1 + (2 * sched_ncpus()) / (8 * sizeof(unsigned long)); + mask = (unsigned long*)malloc(sz * sizeof(unsigned long)); + cpumask = (unsigned long*)malloc(sz * sizeof(unsigned long)); + retval = sched_getaffinity(0, sz * sizeof(unsigned long), cpumask); + if (retval < 0) perror("sched_getaffinity:"); + if (retval < 0) return retval; + + for (i = 0; i < sz * 8 * sizeof(unsigned long); ++i) { + int word = i / (8 * sizeof(unsigned long)); + int bit = i % (8 * sizeof(unsigned long)); + if (cpumask[word] & (1 << bit)) ncpus++; + } + } + cpu %= ncpus; + + bzero(mask, sz * sizeof(unsigned long)); + for (i = 0, j = 0; i < sz * 8 * sizeof(unsigned long); ++i) { + int word = i / (8 * sizeof(unsigned long)); + int bit = i % (8 * sizeof(unsigned long)); + if (cpumask[word] & (1 << bit)) { + if (j >= cpu) { + mask[word] |= (1 << bit); + break; + } + j++; + } + } + retval = sched_setaffinity(0, sz * sizeof(unsigned long), mask); + if (retval < 0) perror("sched_setaffinity:"); +#ifdef _DEBUG + fprintf(stderr, "sched_pin(%d): pid=%d, returning %d\n", cpu, (int)getpid(), retval); +#endif /* _DEBUG */ + +#endif + return retval; +} diff --git a/performance/lmbench3/src/lib_stats.c b/performance/lmbench3/src/lib_stats.c new file mode 100644 index 0000000..cc8b5a6 --- /dev/null +++ b/performance/lmbench3/src/lib_stats.c @@ -0,0 +1,603 @@ +#include <math.h> +#include "bench.h" + +#define BOOTSTRAP_COUNT 200 + +/* + * a comparison function used by qsort + */ +int +int_compare(const void *a, const void *b) +{ + if (*(int*)a < *(int*)b) return -1; + if (*(int*)a > *(int*)b) return 1; + return 0; +} + +/* + * a comparison function used by qsort + */ +int +uint64_compare(const void *a, const void *b) +{ + if (*(uint64*)a < *(uint64*)b) return -1; + if (*(uint64*)a > *(uint64*)b) return 1; + return 0; +} + +/* + * a comparison function used by qsort + */ +int +double_compare(const void *a, const void 
*b) +{ + if (*(double*)a < *(double*)b) return -1; + if (*(double*)a > *(double*)b) return 1; + return 0; +} + +/* + * return the median value of an array of int + */ +int +int_median(int *values, int size) +{ + qsort(values, size, sizeof(int), int_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2; +} + +/* + * return the median value of an array of int + */ +uint64 +uint64_median(uint64 *values, int size) +{ + qsort(values, size, sizeof(uint64), uint64_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2; +} + +/* + * return the median value of an array of doubles + */ +double +double_median(double *values, int size) +{ + qsort(values, size, sizeof(double), double_compare); + + if (size == 0) return 0.; + + if (size % 2) { + return values[size/2]; + } + + return (values[size/2 - 1] + values[size/2]) / 2.0; +} + +/* + * return the mean value of an array of int + */ +int +int_mean(int *values, int size) +{ + int i; + int sum = 0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / size; +} + +/* + * return the mean value of an array of int + */ +uint64 +uint64_mean(uint64 *values, int size) +{ + int i; + uint64 sum = 0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / size; +} + +/* + * return the mean value of an array of doubles + */ +double +double_mean(double *values, int size) +{ + int i; + double sum = 0.0; + + for (i = 0; i < size; ++i) + sum += values[i]; + + return sum / (double)size; +} + +/* + * return the min value of an array of int + */ +int +int_min(int *values, int size) +{ + int i; + int min = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the min value of an array of int + */ +uint64 +uint64_min(uint64 *values, int size) +{ + int i; + uint64 min = values[0]; + + for 
(i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the min value of an array of doubles + */ +double +double_min(double *values, int size) +{ + int i; + double min = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] < min) min = values[i]; + + return min; +} + +/* + * return the max value of an array of int + */ +int +int_max(int *values, int size) +{ + int i; + int max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the max value of an array of int + */ +uint64 +uint64_max(uint64 *values, int size) +{ + int i; + uint64 max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the max value of an array of doubles + */ +double +double_max(double *values, int size) +{ + int i; + double max = values[0]; + + for (i = 1; i < size; ++i) + if (values[i] > max) max = values[i]; + + return max; +} + +/* + * return the variance of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41 + */ +double int_variance(int *values, int size) +{ + int i; + double sum = 0.0; + int mean = int_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + + return sum / (double)(size - 1); +} + +/* + * return the variance of an array of uint64s + */ +double uint64_variance(uint64 *values, int size) +{ + int i; + double sum = 0.0; + uint64 mean = uint64_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + return sum / (double)(size - 1); +} + +/* + * return the variance of an array of doubles + */ +double double_variance(double *values, int size) +{ + int i; + double sum = 0.0; + double mean = double_mean(values, size); + + for (i = 0; i < size; ++i) + sum += (double)((values[i] - mean) * (values[i] - mean)); + + return sum / (double)(size - 1); +} + +/* + * return the moment of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41, 90 + */ +double int_moment(int moment, int *values, int size) +{ + int i, j; + double sum = 0.0; + int mean = int_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the moment of an array of uint64s + */ +double uint64_moment(int moment, uint64 *values, int size) +{ + int i, j; + double sum = 0.0; + uint64 mean = uint64_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the moment of an array of doubles + */ +double double_moment(int moment, double *values, int size) +{ + int i, j; + double sum = 0.0; + double mean = double_mean(values, size); + + for (i = 0; i < size; ++i) { + double diff = values[i] - mean; + double m = diff; + for (j = 1; j < moment; ++j) + m *= diff; + sum += m; + } + + return sum / (double)size; +} + +/* + * return the standard error of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 41 + */ +double int_stderr(int *values, int size) +{ + return sqrt(int_variance(values, size)); +} + +/* + * return the standard error of an array of uint64s + */ +double uint64_stderr(uint64 *values, int size) +{ + return sqrt(uint64_variance(values, size)); +} + +/* + * return the standard error of an array of doubles + */ +double double_stderr(double *values, int size) +{ + return sqrt(double_variance(values, size)); +} + +/* + * return the skew of an array of ints + * + */ +double int_skew(int *values, int size) +{ + double sigma = int_stderr(values, size); + double moment3 = int_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the skew of an array of uint64s + */ +double uint64_skew(uint64 *values, int size) +{ + double sigma = uint64_stderr(values, size); + double moment3 = uint64_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the skew of an array of doubles + */ +double double_skew(double *values, int size) +{ + double sigma = double_stderr(values, size); + double moment3 = double_moment(3, values, size); + + return moment3 / (sigma * sigma * sigma); +} + +/* + * return the kurtosis of an array of ints + * + * Reference: "Statistics for Experimenters" by + * George E.P. Box et. 
al., page 90; + */ +double int_kurtosis(int *values, int size) +{ + double variance = int_variance(values, size); + double moment4 = int_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * return the kurtosis of an array of uint64s + */ +double uint64_kurtosis(uint64 *values, int size) +{ + double variance = uint64_variance(values, size); + double moment4 = uint64_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * return the kurtosis of an array of doubles + */ +double double_kurtosis(double *values, int size) +{ + double variance = double_variance(values, size); + double moment4 = double_moment(4, values, size); + + return moment4 / (variance * variance) - 3; +} + +/* + * BOOTSTRAP: + * + * stderr = sqrt(sum_i(s[i] - sum_j(s[j])/B)**2 / (B - 1)) + * + * Reference: "An Introduction to the Bootstrap" by Bradley + * Efron and Robert J. Tibshirani, page 12. + */ + +/* + * return the bootstrap estimation of the standard error + * of an array of ints + */ +double int_bootstrap_stderr(int *values, int size, int_stat f) +{ + int i, j; + int *samples = (int*)malloc(size * sizeof(int)); + double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum = 0; + double sum = 0; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (double)(*f)(samples, size); + s_sum += s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + sum /= (double)(BOOTSTRAP_COUNT - 1); + + free(samples); + free(s); + + return sqrt(sum); +} + +/* + * return the bootstrap estimation of the standard error + * of an array of uint64s + */ +double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f) +{ + int i, j; + uint64 *samples = (uint64*)malloc(size * sizeof(uint64)); + 
double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum; + double sum; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0, s_sum = 0.0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (double)(*f)(samples, size); + s_sum += s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0, sum = 0.0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + free(samples); + free(s); + + return sqrt(sum/(double)(BOOTSTRAP_COUNT - 1)); +} + +/* + * return the bootstrap estimation of the standard error + * of an array of doubles + */ +double double_bootstrap_stderr(double *values, int size, double_stat f) +{ + int i, j; + double *samples = (double*)malloc(size * sizeof(double)); + double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); + double s_sum = 0; + double sum = 0; + + /* generate the stderr for each of the bootstrap samples */ + for (i = 0; i < BOOTSTRAP_COUNT; ++i) { + for (j = 0; j < size; ++j) + samples[j] = values[rand() % size]; + s[i] = (*f)(samples, size); + s_sum += (double)s[i]; /* CHS: worry about overflow */ + } + s_sum /= (double)BOOTSTRAP_COUNT; + + for (i = 0; i < BOOTSTRAP_COUNT; ++i) + sum += (s[i] - s_sum) * (s[i] - s_sum); + + sum /= (double)(BOOTSTRAP_COUNT - 1); + + free(samples); + free(s); + + return sqrt(sum); +} + +/* + * regression(x, y, sig, n, a, b, sig_a, sig_b, chi2) + * + * This routine is derived from equations in "Numerical Recipes in C" + * (second edition) by Press, et. al., pages 661-665. + * + * compute the linear regression y = a + bx for (x,y), where y[i] has + * standard deviation sig[i]. + * + * returns the coefficients a and b, along with an estimation of their + * error (standard deviation) in sig_a and sig_b. + * + * returns chi2 for "goodness of fit" information. 
+ */ + +void +regression(double *x, double *y, double *sig, int n, + double *a, double *b, double *sig_a, double *sig_b, + double *chi2) +{ + int i; + double S = 0.0, Sx = 0.0, Sy = 0.0, Stt = 0.0, Sx_S; + + /* compute some basic statistics */ + for (i = 0; i < n; ++i) { + /* Equations 15.2.4: for S, Sx, Sy */ + double weight = 1.0 / (sig ? sig[i] * sig[i] : 1.0); + S += weight; + Sx += weight * x[i]; + Sy += weight * y[i]; + } + + *b = 0.0; + Sx_S = Sx / S; + for (i = 0; i < n; ++i) { + /* + * Equation 15.2.15 for t + * Equation 15.2.16 for Stt + * Equation 15.2.17 for b, do summation portion of equation + * compute Sum i=0,n-1 (t_i * y[i] / sig[i])) + */ + double t_i = (x[i] - Sx_S) / (sig ? sig[i] : 1.0); + Stt += t_i * t_i; + *b += t_i * y[i] / (sig ? sig[i] : 1.0); + } + + /* + * Equation 15.2.17 for b, do 1/Stt * summation + * Equation 15.2.18 for a + * Equation 15.2.19 for sig_a + * Equation 15.2.20 for sig_b + */ + *b /= Stt; + *a = (Sy - *b * Sx) / S; + *sig_a = sqrt((1.0 + (Sx * Sx) / (S * Stt)) / S); + *sig_b = sqrt(1.0 / Stt); + + /* Equation 15.2.2 for chi2, the merit function */ + *chi2 = 0.0; + for (i = 0; i < n; ++i) { + double merit = (y[i] - ((*a) + (*b) * x[i])) / (sig ? sig[i] : 1.0); + *chi2 += merit * merit; + } + if (sig == NULL) { + *sig_a *= sqrt((*chi2) / (n - 2)); + *sig_b *= sqrt((*chi2) / (n - 2)); + } +} + diff --git a/performance/lmbench3/src/lib_tcp.c b/performance/lmbench3/src/lib_tcp.c new file mode 100644 index 0000000..d84a63e --- /dev/null +++ b/performance/lmbench3/src/lib_tcp.c @@ -0,0 +1,238 @@ +/* + * tcp_lib.c - routines for managing TCP connections. + * + * Positive port/program numbers are RPC ports, negative ones are TCP ports. + * + * Copyright (c) 1994-1996 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a TCP socket, bind it, figure out the port, + * and advertise the port as program "prog". + * + * XXX - it would be nice if you could advertise ascii strings. 
+ */ +int +tcp_server(int prog, int rdwr) +{ + int sock; + struct sockaddr_in s; + +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "tcp_server(%u, %u)\n", prog, rdwr); +#endif + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + bzero((void*)&s, sizeof(s)); + s.sin_family = AF_INET; + if (prog < 0) { + s.sin_port = htons(-prog); + } + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } + if (listen(sock, 100) < 0) { + perror("listen"); + exit(4); + } + if (prog > 0) { +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server port %d\n", sockport(sock)); +#endif + (void)pmap_unset((u_long)prog, (u_long)1); + if (!pmap_set((u_long)prog, (u_long)1, (u_long)IPPROTO_TCP, + (unsigned short)sockport(sock))) { + perror("pmap_set"); + exit(5); + } + } + return (sock); +} + +/* + * Unadvertise the socket + */ +int +tcp_done(int prog) +{ + if (prog > 0) { + pmap_unset((u_long)prog, (u_long)1); + } + return (0); +} + +/* + * Accept a connection and return it + */ +int +tcp_accept(int sock, int rdwr) +{ + struct sockaddr_in s; + int newsock, namelen; + + namelen = sizeof(s); + bzero((void*)&s, namelen); + +retry: + if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { + if (errno == EINTR) + goto retry; + perror("accept"); + exit(6); + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server newsock port %d\n", sockport(newsock)); +#endif + sock_optimize(newsock, rdwr); + return (newsock); +} + +/* + * Connect to the TCP socket advertised as "prog" on "host" and + * return the connected socket. + * + * Hacked Thu Oct 27 1994 to cache pmap_getport calls. This saves + * about 4000 usecs in loopback lat_connect calls. I suppose we + * should time gethostbyname() & pmap_getprot(), huh? 
+ */ +int +tcp_connect(char *host, int prog, int rdwr) +{ + static struct hostent *h; + static struct sockaddr_in s; + static u_short save_port; + static u_long save_prog; + static char *save_host; + int sock; + static int tries = 0; + + if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { + perror("socket"); + exit(1); + } + if (rdwr & SOCKOPT_PID) { + static unsigned short port; + struct sockaddr_in sin; + + if (!port) { + port = (unsigned short)(getpid() << 4); + if (port < 1024) { + port += 1024; + } + } + do { + port++; + bzero((void*)&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(port); + } while (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) == -1); + } +#ifdef LIBTCP_VERBOSE + else { + struct sockaddr_in sin; + + bzero((void*)&sin, sizeof(sin)); + sin.sin_family = AF_INET; + if (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { + perror("bind"); + exit(2); + } + } + fprintf(stderr, "Client port %d\n", sockport(sock)); +#endif + sock_optimize(sock, rdwr); + if (!h || host != save_host || prog != save_prog) { + save_host = host; /* XXX - counting on them not + * changing it - benchmark only. 
+ */ + save_prog = prog; + if (!(h = gethostbyname(host))) { + perror(host); + exit(2); + } + bzero((void *) &s, sizeof(s)); + s.sin_family = AF_INET; + bcopy((void*)h->h_addr, (void *)&s.sin_addr, h->h_length); + if (prog > 0) { + save_port = pmap_getport(&s, prog, + (u_long)1, IPPROTO_TCP); + if (!save_port) { + perror("lib TCP: No port found"); + exit(3); + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "Server port %d\n", save_port); +#endif + s.sin_port = htons(save_port); + } else { + s.sin_port = htons(-prog); + } + } + if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + if (errno == ECONNRESET + || errno == ECONNREFUSED + || errno == EAGAIN) { + close(sock); + if (++tries > 10) return(-1); + return (tcp_connect(host, prog, rdwr)); + } + perror("connect"); + exit(4); + } + tries = 0; + return (sock); +} + +void +sock_optimize(int sock, int flags) +{ + if (flags & SOCKOPT_READ) { + int sockbuf = SOCKBUF; + + while (setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &sockbuf, + sizeof(int))) { + sockbuf >>= 1; + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "sockopt %d: RCV: %dK\n", sock, sockbuf>>10); +#endif + } + if (flags & SOCKOPT_WRITE) { + int sockbuf = SOCKBUF; + + while (setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &sockbuf, + sizeof(int))) { + sockbuf >>= 1; + } +#ifdef LIBTCP_VERBOSE + fprintf(stderr, "sockopt %d: SND: %dK\n", sock, sockbuf>>10); +#endif + } + if (flags & SOCKOPT_REUSE) { + int val = 1; + if (setsockopt(sock, SOL_SOCKET, + SO_REUSEADDR, &val, sizeof(val)) == -1) { + perror("SO_REUSEADDR"); + } + } +} + +int +sockport(int s) +{ + int namelen; + struct sockaddr_in sin; + + namelen = sizeof(sin); + if (getsockname(s, (struct sockaddr *)&sin, &namelen) < 0) { + perror("getsockname"); + return(-1); + } + return ((int)ntohs(sin.sin_port)); +} diff --git a/performance/lmbench3/src/lib_tcp.h b/performance/lmbench3/src/lib_tcp.h new file mode 100644 index 0000000..bc820b2 --- /dev/null +++ b/performance/lmbench3/src/lib_tcp.h @@ -0,0 +1,12 @@ +#include 
<sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> + +int tcp_server(int prog, int rdwr); +int tcp_done(int prog); +int tcp_accept(int sock, int rdwr); +int tcp_connect(char *host, int prog, int rdwr); +void sock_optimize(int sock, int rdwr); +int sockport(int s); diff --git a/performance/lmbench3/src/lib_timing.c b/performance/lmbench3/src/lib_timing.c new file mode 100644 index 0000000..af8cf68 --- /dev/null +++ b/performance/lmbench3/src/lib_timing.c @@ -0,0 +1,1774 @@ +/* + * a timing utilities library + * + * Requires 64bit integers to work. + * + * %W% %@% + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994-1998 Larry McVoy. + * Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* #define _DEBUG */ + +#define nz(x) ((x) == 0 ? 1 : (x)) + +/* + * I know you think these should be 2^10 and 2^20, but people are quoting + * disk sizes in powers of 10, and bandwidths are all power of ten. + * Deal with it. 
+ */ +#define MB (1000*1000.0) +#define KB (1000.0) + +static struct timeval start_tv, stop_tv; +FILE *ftiming; +static volatile uint64 use_result_dummy; +static uint64 iterations; +static void init_timing(void); + +#if defined(hpux) || defined(__hpux) +#include <sys/mman.h> +#endif + +#ifdef RUSAGE +#include <sys/resource.h> +#define SECS(tv) (tv.tv_sec + tv.tv_usec / 1000000.0) +#define mine(f) (int)(ru_stop.f - ru_start.f) + +static struct rusage ru_start, ru_stop; + +void +rusage(void) +{ + double sys, user, idle; + double per; + + sys = SECS(ru_stop.ru_stime) - SECS(ru_start.ru_stime); + user = SECS(ru_stop.ru_utime) - SECS(ru_start.ru_utime); + idle = timespent() - (sys + user); + per = idle / timespent() * 100; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "real=%.2f sys=%.2f user=%.2f idle=%.2f stall=%.0f%% ", + timespent(), sys, user, idle, per); + fprintf(ftiming, "rd=%d wr=%d min=%d maj=%d ctx=%d\n", + mine(ru_inblock), mine(ru_oublock), + mine(ru_minflt), mine(ru_majflt), + mine(ru_nvcsw) + mine(ru_nivcsw)); +} + +#endif /* RUSAGE */ + +void +lmbench_usage(int argc, char *argv[], char* usage) +{ + fprintf(stderr,"Usage: %s %s", argv[0], usage); + exit(-1); +} + + +void +sigchld_wait_handler(int signo) +{ + wait(0); + signal(SIGCHLD, sigchld_wait_handler); +} + +static int benchmp_sigterm_received; +static int benchmp_sigchld_received; +static pid_t benchmp_sigalrm_pid; +static int benchmp_sigalrm_timeout; +void (*benchmp_sigterm_handler)(int); +void (*benchmp_sigchld_handler)(int); +void (*benchmp_sigalrm_handler)(int); + +void +benchmp_sigterm(int signo) +{ + benchmp_sigterm_received = 1; +} + +void +benchmp_sigchld(int signo) +{ + signal(SIGCHLD, SIG_DFL); + benchmp_sigchld_received = 1; +#ifdef _DEBUG + fprintf(stderr, "benchmp_sigchld handler\n"); +#endif +} + +void +benchmp_sigalrm(int signo) +{ + signal(SIGALRM, SIG_IGN); + kill(benchmp_sigalrm_pid, SIGTERM); + /* + * Since we already waited a full timeout period for the child + * to die, 
we only need to wait a little longer for subsequent + * children to die. + */ + benchmp_sigalrm_timeout = 1; +} + +void +benchmp_child(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int childid, + int response, + int start_signal, + int result_signal, + int exit_signal, + int parallel, + iter_t iterations, + int repetitions, + int enough, + void* cookie + ); +void +benchmp_parent(int response, + int start_signal, + int result_signal, + int exit_signal, + pid_t* pids, + int parallel, + iter_t iterations, + int warmup, + int repetitions, + int enough + ); + +int +sizeof_result(int repetitions); + +void +benchmp(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int enough, + int parallel, + int warmup, + int repetitions, + void* cookie) +{ + iter_t iterations = 1; + double result = 0.; + double usecs; + long i, j; + pid_t pid; + pid_t *pids = NULL; + int response[2]; + int start_signal[2]; + int result_signal[2]; + int exit_signal[2]; + int need_warmup; + fd_set fds; + struct timeval timeout; + +#ifdef _DEBUG + fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): entering\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie); +#endif + enough = get_enough(enough); +#ifdef _DEBUG + fprintf(stderr, "\tenough=%d\n", enough); +#endif + + /* initialize results */ + settime(0); + save_n(1); + + if (parallel > 1) { + /* Compute the baseline performance */ + benchmp(initialize, benchmark, cleanup, + enough, 1, warmup, repetitions, cookie); + + /* if we can't even do a single job, then give up */ + if (gettime() == 0) + return; + + /* calculate iterations for 1sec runtime */ + iterations = get_n(); + if (enough < SHORT) { + double tmp = (double)SHORT * (double)get_n(); + tmp /= (double)gettime(); + iterations = (iter_t)tmp + 1; + } + settime(0); + save_n(1); + } + + /* Create the necessary pipes for control */ + if (pipe(response) < 0 + || pipe(start_signal) < 0 + || pipe(result_signal) < 0 + || 
pipe(exit_signal) < 0) { +#ifdef _DEBUG + fprintf(stderr, "BENCHMP: Could not create control pipes\n"); +#endif /* _DEBUG */ + return; + } + + /* fork the necessary children */ + benchmp_sigchld_received = 0; + benchmp_sigterm_received = 0; + benchmp_sigterm_handler = signal(SIGTERM, benchmp_sigterm); + benchmp_sigchld_handler = signal(SIGCHLD, benchmp_sigchld); + pids = (pid_t*)malloc(parallel * sizeof(pid_t)); + if (!pids) return; + bzero((void*)pids, parallel * sizeof(pid_t)); + + for (i = 0; i < parallel; ++i) { + if (benchmp_sigterm_received) + goto error_exit; +#ifdef _DEBUG + fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): creating child %d\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie, i); +#endif + switch(pids[i] = fork()) { + case -1: + /* could not open enough children! */ +#ifdef _DEBUG + fprintf(stderr, "BENCHMP: fork() failed!\n"); +#endif /* _DEBUG */ + goto error_exit; + case 0: + /* If child */ + close(response[0]); + close(start_signal[1]); + close(result_signal[1]); + close(exit_signal[1]); + handle_scheduler(i, 0, 0); + benchmp_child(initialize, + benchmark, + cleanup, + i, + response[1], + start_signal[0], + result_signal[0], + exit_signal[0], + enough, + iterations, + parallel, + repetitions, + cookie + ); + exit(0); + default: + break; + } + } + close(response[1]); + close(start_signal[0]); + close(result_signal[0]); + close(exit_signal[0]); + benchmp_parent(response[0], + start_signal[1], + result_signal[1], + exit_signal[1], + pids, + parallel, + iterations, + warmup, + repetitions, + enough + ); + goto cleanup_exit; + +error_exit: + /* give the children a chance to clean up gracefully */ + signal(SIGCHLD, SIG_DFL); + while (--i >= 0) { + kill(pids[i], SIGTERM); + waitpid(pids[i], NULL, 0); + } + +cleanup_exit: + /* + * Clean up and kill all children + * + * NOTE: the children themselves SHOULD exit, and + * Killing them could prevent them from + * cleanup up subprocesses, etc... 
So, we only + * want to kill child processes when it appears + * that they will not die of their own accord. + * We wait twice the timing interval plus two seconds + * for children to die. If they haven't died by + * that time, then we start killing them. + */ + benchmp_sigalrm_timeout = (int)((2 * enough)/1000000) + 2; + if (benchmp_sigalrm_timeout < 5) + benchmp_sigalrm_timeout = 5; + signal(SIGCHLD, SIG_DFL); + while (i-- > 0) { + /* wait timeout seconds for child to die, then kill it */ + benchmp_sigalrm_pid = pids[i]; + benchmp_sigalrm_handler = signal(SIGALRM, benchmp_sigalrm); + alarm(benchmp_sigalrm_timeout); + + waitpid(pids[i], NULL, 0); + + alarm(0); + signal(SIGALRM, benchmp_sigalrm_handler); + } + + if (pids) free(pids); +#ifdef _DEBUG + fprintf(stderr, "benchmp(0x%x, 0x%x, 0x%x, %d, %d, 0x%x): exiting\n", (unsigned int)initialize, (unsigned int)benchmark, (unsigned int)cleanup, enough, parallel, (unsigned int)cookie); +#endif +} + +void +benchmp_parent( int response, + int start_signal, + int result_signal, + int exit_signal, + pid_t* pids, + int parallel, + iter_t iterations, + int warmup, + int repetitions, + int enough + ) +{ + int i,j,k,l; + int bytes_read; + result_t* results = NULL; + result_t* merged_results = NULL; + char* signals = NULL; + unsigned char* buf; + fd_set fds_read, fds_error; + struct timeval timeout; + + if (benchmp_sigchld_received || benchmp_sigterm_received) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: entering, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + + results = (result_t*)malloc(sizeof_result(repetitions)); + merged_results = (result_t*)malloc(sizeof_result(parallel * repetitions)); + signals = (char*)malloc(parallel * sizeof(char)); + if (!results || !merged_results || !signals) return; + + /* Collect 'ready' signals */ + for (i = 0; i < parallel * sizeof(char); i += bytes_read) { + bytes_read = 0; + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + FD_SET(response, 
&fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: ready, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, signals, parallel * sizeof(char) - i); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: ready, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + + /* let the children run for warmup microseconds */ + if (warmup > 0) { + struct timeval delay; + delay.tv_sec = warmup / 1000000; + delay.tv_usec = warmup % 1000000; + + select(0, NULL, NULL, NULL, &delay); + } + + /* send 'start' signal */ + write(start_signal, signals, parallel * sizeof(char)); + + /* Collect 'done' signals */ + for (i = 0; i < parallel * sizeof(char); i += bytes_read) { + bytes_read = 0; + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + FD_SET(response, &fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: done, benchmp_child_died=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, signals, parallel * sizeof(char) - i); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: done, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + + /* collect results */ + insertinit(merged_results); + for (i = 0; i < parallel; ++i) { + int n = sizeof_result(repetitions); + buf = 
(unsigned char*)results; + + FD_ZERO(&fds_read); + FD_ZERO(&fds_error); + + /* tell one child to report its results */ + write(result_signal, buf, sizeof(char)); + + for (; n > 0; n -= bytes_read, buf += bytes_read) { + bytes_read = 0; + FD_SET(response, &fds_read); + FD_SET(response, &fds_error); + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + select(response+1, &fds_read, NULL, &fds_error, &timeout); + if (benchmp_sigchld_received + || benchmp_sigterm_received + || FD_ISSET(response, &fds_error)) + { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: results, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); +#endif + goto error_exit; + } + if (!FD_ISSET(response, &fds_read)) { + continue; + } + + bytes_read = read(response, buf, n); + if (bytes_read < 0) { +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: results, bytes_read=%d, %s\n", bytes_read, strerror(errno)); +#endif + goto error_exit; + } + } + for (j = 0; j < results->N; ++j) { + insertsort(results->v[j].u, + results->v[j].n, merged_results); + } + } + + /* we allow children to die now, without it causing an error */ + signal(SIGCHLD, SIG_DFL); + + /* send 'exit' signals */ + write(exit_signal, results, parallel * sizeof(char)); + + /* Compute median time; iterations is constant! 
*/ + set_results(merged_results); + + goto cleanup_exit; +error_exit: +#ifdef _DEBUG + fprintf(stderr, "benchmp_parent: error_exit!\n"); +#endif + signal(SIGCHLD, SIG_DFL); + for (i = 0; i < parallel; ++i) { + kill(pids[i], SIGTERM); + waitpid(pids[i], NULL, 0); + } + free(merged_results); +cleanup_exit: + close(response); + close(start_signal); + close(result_signal); + close(exit_signal); + + if (results) free(results); + if (signals) free(signals); +} + + +typedef enum { warmup, timing_interval, cooldown } benchmp_state; + +typedef struct { + benchmp_state state; + benchmp_f initialize; + benchmp_f benchmark; + benchmp_f cleanup; + int childid; + int response; + int start_signal; + int result_signal; + int exit_signal; + int enough; + iter_t iterations; + int parallel; + int repetitions; + void* cookie; + iter_t iterations_batch; + int need_warmup; + long i; + int r_size; + result_t* r; +} benchmp_child_state; + +static benchmp_child_state _benchmp_child_state; + +int +benchmp_childid() +{ + return _benchmp_child_state.childid; +} + +void +benchmp_child_sigchld(int signo) +{ +#ifdef _DEBUG + fprintf(stderr, "benchmp_child_sigchld handler\n"); +#endif + if (_benchmp_child_state.cleanup) { + signal(SIGCHLD, SIG_DFL); + (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); + } + exit(1); +} + +void +benchmp_child_sigterm(int signo) +{ + signal(SIGTERM, SIG_IGN); + if (_benchmp_child_state.cleanup) { + void (*sig)(int) = signal(SIGCHLD, SIG_DFL); + if (sig != benchmp_child_sigchld && sig != SIG_DFL) { + signal(SIGCHLD, sig); + } + (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); + } + exit(0); +} + +void* +benchmp_getstate() +{ + return ((void*)&_benchmp_child_state); +} + +void +benchmp_child(benchmp_f initialize, + benchmp_f benchmark, + benchmp_f cleanup, + int childid, + int response, + int start_signal, + int result_signal, + int exit_signal, + int enough, + iter_t iterations, + int parallel, + int repetitions, + void* cookie + ) +{ + iter_t 
iterations_batch = (parallel > 1) ? get_n() : 1; + double result = 0.; + double usecs; + long i = 0; + int need_warmup; + fd_set fds; + struct timeval timeout; + + _benchmp_child_state.state = warmup; + _benchmp_child_state.initialize = initialize; + _benchmp_child_state.benchmark = benchmark; + _benchmp_child_state.cleanup = cleanup; + _benchmp_child_state.childid = childid; + _benchmp_child_state.response = response; + _benchmp_child_state.start_signal = start_signal; + _benchmp_child_state.result_signal = result_signal; + _benchmp_child_state.exit_signal = exit_signal; + _benchmp_child_state.enough = enough; + _benchmp_child_state.iterations = iterations; + _benchmp_child_state.iterations_batch = iterations_batch; + _benchmp_child_state.parallel = parallel; + _benchmp_child_state.repetitions = repetitions; + _benchmp_child_state.cookie = cookie; + _benchmp_child_state.need_warmup = 1; + _benchmp_child_state.i = 0; + _benchmp_child_state.r_size = sizeof_result(repetitions); + _benchmp_child_state.r = (result_t*)malloc(_benchmp_child_state.r_size); + + if (!_benchmp_child_state.r) return; + insertinit(_benchmp_child_state.r); + set_results(_benchmp_child_state.r); + + need_warmup = 1; + timeout.tv_sec = 0; + timeout.tv_usec = 0; + + if (benchmp_sigchld_handler != SIG_DFL) { + signal(SIGCHLD, benchmp_sigchld_handler); + } else { + signal(SIGCHLD, benchmp_child_sigchld); + } + + if (initialize) + (*initialize)(0, cookie); + + if (benchmp_sigterm_handler != SIG_DFL) { + signal(SIGTERM, benchmp_sigterm_handler); + } else { + signal(SIGTERM, benchmp_child_sigterm); + } + if (benchmp_sigterm_received) + benchmp_child_sigterm(SIGTERM); + + /* start experiments, collecting results */ + insertinit(_benchmp_child_state.r); + + while (1) { + (*benchmark)(benchmp_interval(&_benchmp_child_state), cookie); + } +} + +iter_t +benchmp_interval(void* _state) +{ + char c; + iter_t iterations; + double result; + fd_set fds; + struct timeval timeout; + benchmp_child_state* state = 
(benchmp_child_state*)_state; + + iterations = (state->state == timing_interval ? state->iterations : state->iterations_batch); + + if (!state->need_warmup) { + result = stop(0,0); + if (state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(iterations, state->cookie); + } + save_n(state->iterations); + result -= t_overhead() + get_n() * l_overhead(); + settime(result >= 0. ? (uint64)result : 0.); + } + + /* if the parent died, then give up */ + if (getppid() == 1 && state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(0, state->cookie); + exit(0); + } + + timeout.tv_sec = 0; + timeout.tv_usec = 0; + FD_ZERO(&fds); + + switch (state->state) { + case warmup: + iterations = state->iterations_batch; + FD_SET(state->start_signal, &fds); + select(state->start_signal+1, &fds, NULL, + NULL, &timeout); + if (FD_ISSET(state->start_signal, &fds)) { + state->state = timing_interval; + read(state->start_signal, &c, sizeof(char)); + iterations = state->iterations; + } + if (state->need_warmup) { + state->need_warmup = 0; + /* send 'ready' */ + write(state->response, &c, sizeof(char)); + } + break; + case timing_interval: + iterations = state->iterations; + if (state->parallel > 1 || result > 0.95 * state->enough) { + insertsort(gettime(), get_n(), get_results()); + state->i++; + /* we completed all the experiments, return results */ + if (state->i >= state->repetitions) { + state->state = cooldown; + } + } + if (state->parallel == 1 + && (result < 0.99 * state->enough || result > 1.2 * state->enough)) { + if (result > 150.) { + double tmp = iterations / result; + tmp *= 1.1 * state->enough; + iterations = (iter_t)(tmp + 1); + } else { + iterations <<= 3; + if (iterations > 1<<27 + || result < 0. 
&& iterations > 1<<20) { + state->state = cooldown; + } + } + } + state->iterations = iterations; + if (state->state == cooldown) { + /* send 'done' */ + write(state->response, (void*)&c, sizeof(char)); + iterations = state->iterations_batch; + } + break; + case cooldown: + iterations = state->iterations_batch; + FD_SET(state->result_signal, &fds); + select(state->result_signal+1, &fds, NULL, NULL, &timeout); + if (FD_ISSET(state->result_signal, &fds)) { + /* + * At this point all children have stopped their + * measurement loops, so we can block waiting for + * the parent to tell us to send our results back. + * From this point on, we will do no more "work". + */ + read(state->result_signal, (void*)&c, sizeof(char)); + write(state->response, (void*)get_results(), state->r_size); + if (state->cleanup) { + if (benchmp_sigchld_handler == SIG_DFL) + signal(SIGCHLD, SIG_DFL); + (*state->cleanup)(0, state->cookie); + } + + /* Now wait for signal to exit */ + read(state->exit_signal, (void*)&c, sizeof(char)); + exit(0); + } + }; + if (state->initialize) { + (*state->initialize)(iterations, state->cookie); + } + start(0); + return (iterations); +} + + +/* + * Redirect output someplace else. + */ +void +timing(FILE *out) +{ + ftiming = out; +} + +/* + * Start timing now. + */ +void +start(struct timeval *tv) +{ + if (tv == NULL) { + tv = &start_tv; + } +#ifdef RUSAGE + getrusage(RUSAGE_SELF, &ru_start); +#endif + (void) gettimeofday(tv, (struct timezone *) 0); +} + +/* + * Stop timing and return real time in microseconds. 
+ */ +uint64 +stop(struct timeval *begin, struct timeval *end) +{ + if (end == NULL) { + end = &stop_tv; + } + (void) gettimeofday(end, (struct timezone *) 0); +#ifdef RUSAGE + getrusage(RUSAGE_SELF, &ru_stop); +#endif + + if (begin == NULL) { + begin = &start_tv; + } + return (tvdelta(begin, end)); +} + +uint64 +now(void) +{ + struct timeval t; + uint64 m; + + (void) gettimeofday(&t, (struct timezone *) 0); + m = t.tv_sec; + m *= 1000000; + m += t.tv_usec; + return (m); +} + +double +Now(void) +{ + struct timeval t; + + (void) gettimeofday(&t, (struct timezone *) 0); + return (t.tv_sec * 1000000.0 + t.tv_usec); +} + +uint64 +delta(void) +{ + static struct timeval last; + struct timeval t; + struct timeval diff; + uint64 m; + + (void) gettimeofday(&t, (struct timezone *) 0); + if (last.tv_usec) { + tvsub(&diff, &t, &last); + last = t; + m = diff.tv_sec; + m *= 1000000; + m += diff.tv_usec; + return (m); + } else { + last = t; + return (0); + } +} + +double +Delta(void) +{ + struct timeval t; + struct timeval diff; + + (void) gettimeofday(&t, (struct timezone *) 0); + tvsub(&diff, &t, &start_tv); + return (diff.tv_sec + diff.tv_usec / 1000000.0); +} + +void +save_n(uint64 n) +{ + iterations = n; +} + +uint64 +get_n(void) +{ + return (iterations); +} + +/* + * Make the time spend be usecs. 
+ */ +void +settime(uint64 usecs) +{ + bzero((void*)&start_tv, sizeof(start_tv)); + stop_tv.tv_sec = usecs / 1000000; + stop_tv.tv_usec = usecs % 1000000; +} + +void +bandwidth(uint64 bytes, uint64 times, int verbose) +{ + struct timeval tdiff; + double mb, secs; + + tvsub(&tdiff, &stop_tv, &start_tv); + secs = tdiff.tv_sec; + secs *= 1000000; + secs += tdiff.tv_usec; + secs /= 1000000; + secs /= times; + mb = bytes / MB; + if (!ftiming) ftiming = stderr; + if (verbose) { + (void) fprintf(ftiming, + "%.4f MB in %.4f secs, %.4f MB/sec\n", + mb, secs, mb/secs); + } else { + if (mb < 1) { + (void) fprintf(ftiming, "%.6f ", mb); + } else { + (void) fprintf(ftiming, "%.2f ", mb); + } + if (mb / secs < 1) { + (void) fprintf(ftiming, "%.6f\n", mb/secs); + } else { + (void) fprintf(ftiming, "%.2f\n", mb/secs); + } + } +} + +void +kb(uint64 bytes) +{ + struct timeval td; + double s, bs; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + bs = bytes / nz(s); + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + (void) fprintf(ftiming, "%.0f KB/sec\n", bs / KB); +} + +void +mb(uint64 bytes) +{ + struct timeval td; + double s, bs; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + bs = bytes / nz(s); + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + (void) fprintf(ftiming, "%.2f MB/sec\n", bs / MB); +} + +void +latency(uint64 xfers, uint64 size) +{ + struct timeval td; + double s; + + if (!ftiming) ftiming = stderr; + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (xfers > 1) { + fprintf(ftiming, "%d %dKB xfers in %.2f secs, ", + (int) xfers, (int) (size / KB), s); + } else { + fprintf(ftiming, "%.1fKB in ", size / KB); + } + if ((s * 1000 / xfers) > 100) { + fprintf(ftiming, "%.0f millisec%s, ", + s * 1000 / xfers, xfers > 1 ? "/xfer" : "s"); + } else { + fprintf(ftiming, "%.4f millisec%s, ", + s * 1000 / xfers, xfers > 1 ? 
"/xfer" : "s"); + } + if (((xfers * size) / (MB * s)) > 1) { + fprintf(ftiming, "%.2f MB/sec\n", (xfers * size) / (MB * s)); + } else { + fprintf(ftiming, "%.2f KB/sec\n", (xfers * size) / (KB * s)); + } +} + +void +context(uint64 xfers) +{ + struct timeval td; + double s; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, + "%d context switches in %.2f secs, %.0f microsec/switch\n", + (int)xfers, s, s * 1000000 / xfers); +} + +void +nano(char *s, uint64 n) +{ + struct timeval td; + double micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro *= 1000; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "%s: %.2f nanoseconds\n", s, micro / n); +} + +void +micro(char *s, uint64 n) +{ + struct timeval td; + double micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro /= n; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); +#if 0 + if (micro >= 100) { + fprintf(ftiming, "%s: %.1f microseconds\n", s, micro); + } else if (micro >= 10) { + fprintf(ftiming, "%s: %.3f microseconds\n", s, micro); + } else { + fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); + } +#endif +} + +void +micromb(uint64 sz, uint64 n) +{ + struct timeval td; + double mb, micro; + + tvsub(&td, &stop_tv, &start_tv); + micro = td.tv_sec * 1000000 + td.tv_usec; + micro /= n; + mb = sz; + mb /= MB; + if (micro == 0.0) return; + if (!ftiming) ftiming = stderr; + if (micro >= 10) { + fprintf(ftiming, "%.6f %.0f\n", mb, micro); + } else { + fprintf(ftiming, "%.6f %.3f\n", mb, micro); + } +} + +void +milli(char *s, uint64 n) +{ + struct timeval td; + uint64 milli; + + tvsub(&td, &stop_tv, &start_tv); + milli = td.tv_sec * 1000 + td.tv_usec / 1000; + milli /= n; + if (milli == 0.0) return; + if (!ftiming) ftiming = stderr; 
+ fprintf(ftiming, "%s: %d milliseconds\n", s, (int)milli); +} + +void +ptime(uint64 n) +{ + struct timeval td; + double s; + + tvsub(&td, &stop_tv, &start_tv); + s = td.tv_sec + td.tv_usec / 1000000.0; + if (s == 0.0) return; + if (!ftiming) ftiming = stderr; + fprintf(ftiming, + "%d in %.2f secs, %.0f microseconds each\n", + (int)n, s, s * 1000000 / n); +} + +uint64 +tvdelta(struct timeval *start, struct timeval *stop) +{ + struct timeval td; + uint64 usecs; + + tvsub(&td, stop, start); + usecs = td.tv_sec; + usecs *= 1000000; + usecs += td.tv_usec; + return (usecs); +} + +void +tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0) +{ + tdiff->tv_sec = t1->tv_sec - t0->tv_sec; + tdiff->tv_usec = t1->tv_usec - t0->tv_usec; + if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) { + tdiff->tv_sec--; + tdiff->tv_usec += 1000000; + assert(tdiff->tv_usec >= 0); + } + + /* time shouldn't go backwards!!! */ + if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) { + tdiff->tv_sec = 0; + tdiff->tv_usec = 0; + } +} + +uint64 +gettime(void) +{ + return (tvdelta(&start_tv, &stop_tv)); +} + +double +timespent(void) +{ + struct timeval td; + + tvsub(&td, &stop_tv, &start_tv); + return (td.tv_sec + td.tv_usec / 1000000.0); +} + +static char p64buf[10][20]; +static int n; + +char * +p64(uint64 big) +{ + char *s = p64buf[n++]; + + if (n == 10) n = 0; +#ifdef linux + { + int *a = (int*)&big; + + if (a[1]) { + sprintf(s, "0x%x%08x", a[1], a[0]); + } else { + sprintf(s, "0x%x", a[0]); + } + } +#endif +#ifdef __sgi + sprintf(s, "0x%llx", big); +#endif + return (s); +} + +char * +p64sz(uint64 big) +{ + double d = big; + char *tags = " KMGTPE"; + int t = 0; + char *s = p64buf[n++]; + + if (n == 10) n = 0; + while (d > 512) t++, d /= 1024; + if (d == 0) { + return ("0"); + } + if (d < 100) { + sprintf(s, "%.4f%c", d, tags[t]); + } else { + sprintf(s, "%.2f%c", d, tags[t]); + } + return (s); +} + +char +last(char *s) +{ + while (*s++) + ; + return (s[-2]); +} + +uint64 +bytes(char 
*s) +{ + uint64 n; + + if (sscanf(s, "%llu", &n) < 1) + return (0); + + if ((last(s) == 'k') || (last(s) == 'K')) + n *= 1024; + if ((last(s) == 'm') || (last(s) == 'M')) + n *= (1024 * 1024); + return (n); +} + +void +use_int(int result) { use_result_dummy += result; } + +void +use_pointer(void *result) { use_result_dummy += (long)result; } + +int +sizeof_result(int repetitions) +{ + if (repetitions <= TRIES) + return (sizeof(result_t)); + return (sizeof(result_t) + (repetitions - TRIES) * sizeof(value_t)); +} + +void +insertinit(result_t *r) +{ + int i; + + r->N = 0; +} + +/* biggest to smallest */ +void +insertsort(uint64 u, uint64 n, result_t *r) +{ + int i, j; + + if (u == 0) return; + +#ifdef _DEBUG + fprintf(stderr, "\tinsertsort(%llu, %llu, %p)\n", u, n, r); +#endif /* _DEBUG */ + for (i = 0; i < r->N; ++i) { + if (u/(double)n > r->v[i].u/(double)r->v[i].n) { + for (j = r->N; j > i; --j) { + r->v[j] = r->v[j - 1]; + } + break; + } + } + r->v[i].u = u; + r->v[i].n = n; + r->N++; +} + +static result_t _results; +static result_t* results = &_results; + +result_t* +get_results() +{ + return (results); +} + +void +set_results(result_t *r) +{ + results = r; + save_median(); +} + +void +save_minimum() +{ + if (results->N == 0) { + save_n(1); + settime(0); + } else { + save_n(results->v[results->N - 1].n); + settime(results->v[results->N - 1].u); + } +} + +void +save_median() +{ + int i = results->N / 2; + uint64 u, n; + + if (results->N == 0) { + n = 1; + u = 0; + } else if (results->N % 2) { + n = results->v[i].n; + u = results->v[i].u; + } else { + n = (results->v[i].n + results->v[i-1].n) / 2; + u = (results->v[i].u + results->v[i-1].u) / 2; + } +#ifdef _DEBUG + fprintf(stderr, "save_median: N=%d, n=%lu, u=%lu\n", results->N, (unsigned long)n, (unsigned long)u); +#endif /* _DEBUG */ + save_n(n); settime(u); +} + +/* + * The inner loop tracks bench.h but uses a different results array. 
+ */ +static long * +one_op(register long *p) +{ + BENCH_INNER(p = (long *)*p;, 0); + return (p); +} + +static long * +two_op(register long *p) +{ + BENCH_INNER(p = (long *)*p; p = (long*)*p;, 0); + return (p); +} + +static long *p = (long *)&p; +static long *q = (long *)&q; + +double +l_overhead(void) +{ + int i; + uint64 N_save, u_save; + static double overhead; + static int initialized = 0; + result_t one, two, *r_save; + + init_timing(); + if (initialized) return (overhead); + + initialized = 1; + if (getenv("LOOP_O")) { + overhead = atof(getenv("LOOP_O")); + } else { + r_save = get_results(); N_save = get_n(); u_save = gettime(); + insertinit(&one); + insertinit(&two); + for (i = 0; i < TRIES; ++i) { + use_pointer((void*)one_op(p)); + if (gettime() > t_overhead()) + insertsort(gettime() - t_overhead(), get_n(), &one); + use_pointer((void *)two_op(p)); + if (gettime() > t_overhead()) + insertsort(gettime() - t_overhead(), get_n(), &two); + } + /* + * u1 = (n1 * (overhead + work)) + * u2 = (n2 * (overhead + 2 * work)) + * ==> overhead = 2. * u1 / n1 - u2 / n2 + */ + set_results(&one); + save_minimum(); + overhead = 2. * gettime() / (double)get_n(); + + set_results(&two); + save_minimum(); + overhead -= gettime() / (double)get_n(); + + if (overhead < 0.) overhead = 0.; /* Gag */ + + set_results(r_save); save_n(N_save); settime(u_save); + } + return (overhead); +} + +/* + * Figure out the timing overhead. 
This has to track bench.h + */ +uint64 +t_overhead(void) +{ + uint64 N_save, u_save; + static int initialized = 0; + static uint64 overhead = 0; + struct timeval tv; + result_t *r_save; + + init_timing(); + if (initialized) return (overhead); + + initialized = 1; + if (getenv("TIMING_O")) { + overhead = atof(getenv("TIMING_O")); + } else if (get_enough(0) <= 50000) { + /* it is not in the noise, so compute it */ + int i; + result_t r; + + r_save = get_results(); N_save = get_n(); u_save = gettime(); + insertinit(&r); + for (i = 0; i < TRIES; ++i) { + BENCH_INNER(gettimeofday(&tv, 0), 0); + insertsort(gettime(), get_n(), &r); + } + set_results(&r); + save_minimum(); + overhead = gettime() / get_n(); + + set_results(r_save); save_n(N_save); settime(u_save); + } + return (overhead); +} + +/* + * Figure out how long to run it. + * If enough == 0, then they want us to figure it out. + * If enough is !0 then return it unless we think it is too short. + */ +static int long_enough; +static int compute_enough(); + +int +get_enough(int e) +{ + init_timing(); + return (long_enough > e ? 
long_enough : e); +} + + +static void +init_timing(void) +{ + static int done = 0; + + if (done) return; + done = 1; + long_enough = compute_enough(); + t_overhead(); + l_overhead(); +} + +typedef long TYPE; + +static TYPE ** +enough_duration(register long N, register TYPE ** p) +{ +#define ENOUGH_DURATION_TEN(one) one one one one one one one one one one + while (N-- > 0) { + ENOUGH_DURATION_TEN(p = (TYPE **) *p;); + } + return (p); +} + +static uint64 +duration(long N) +{ + uint64 usecs; + TYPE *x = (TYPE *)&x; + TYPE **p = (TYPE **)&x; + + start(0); + p = enough_duration(N, p); + usecs = stop(0, 0); + use_pointer((void *)p); + return (usecs); +} + +/* + * find the minimum time that work "N" takes in "tries" tests + */ +static uint64 +time_N(iter_t N) +{ + int i; + uint64 usecs; + result_t r, *r_save; + + r_save = get_results(); + insertinit(&r); + for (i = 1; i < TRIES; ++i) { + usecs = duration(N); + insertsort(usecs, N, &r); + } + set_results(&r); + save_minimum(); + usecs = gettime(); + set_results(r_save); + return (usecs); +} + +/* + * return the amount of work needed to run "enough" microseconds + */ +static iter_t +find_N(int enough) +{ + int tries; + static iter_t N = 10000; + static uint64 usecs = 0; + + if (!usecs) usecs = time_N(N); + + for (tries = 0; tries < 10; ++tries) { + if (0.98 * enough < usecs && usecs < 1.02 * enough) + return (N); + if (usecs < 1000) + N *= 10; + else { + double n = N; + + n /= usecs; + n *= enough; + N = n + 1; + } + usecs = time_N(N); + } + return (0); +} + +/* + * We want to verify that small modifications proportionally affect the runtime + */ +static double test_points[] = {1.015, 1.02, 1.035}; +static int +test_time(int enough) +{ + int i; + iter_t N; + uint64 usecs, expected, baseline, diff; + + if ((N = find_N(enough)) == 0) + return (0); + + baseline = time_N(N); + + for (i = 0; i < sizeof(test_points) / sizeof(double); ++i) { + usecs = time_N((int)((double) N * test_points[i])); + expected = 
(uint64)((double)baseline * test_points[i]); + diff = expected > usecs ? expected - usecs : usecs - expected; + if (diff / (double)expected > 0.0025) + return (0); + } + return (1); +} + + +/* + * We want to find the smallest timing interval that has accurate timing + */ +static int possibilities[] = { 5000, 10000, 50000, 100000 }; +static int +compute_enough() +{ + int i; + + if (getenv("ENOUGH")) { + return (atoi(getenv("ENOUGH"))); + } + for (i = 0; i < sizeof(possibilities) / sizeof(int); ++i) { + if (test_time(possibilities[i])) + return (possibilities[i]); + } + + /* + * if we can't find a timing interval that is sufficient, + * then use SHORT as a default. + */ + return (SHORT); +} + +/* + * This stuff isn't really lib_timing, but ... + */ +void +morefds(void) +{ +#ifdef RLIMIT_NOFILE + struct rlimit r; + + getrlimit(RLIMIT_NOFILE, &r); + r.rlim_cur = r.rlim_max; + setrlimit(RLIMIT_NOFILE, &r); +#endif +} + +/* analogous to bzero, bcopy, etc., except that it just reads + * data into the processor + */ +long +bread(void* buf, long nbytes) +{ + long sum = 0; + register long *p, *next; + register char *end; + + p = (long*)buf; + end = (char*)buf + nbytes; + for (next = p + 128; (void*)next <= (void*)end; p = next, next += 128) { + sum += + p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ + p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ + p[15]+p[16]+p[17]+p[18]+p[19]+p[20]+p[21]+ + p[22]+p[23]+p[24]+p[25]+p[26]+p[27]+p[28]+ + p[29]+p[30]+p[31]+p[32]+p[33]+p[34]+p[35]+ + p[36]+p[37]+p[38]+p[39]+p[40]+p[41]+p[42]+ + p[43]+p[44]+p[45]+p[46]+p[47]+p[48]+p[49]+ + p[50]+p[51]+p[52]+p[53]+p[54]+p[55]+p[56]+ + p[57]+p[58]+p[59]+p[60]+p[61]+p[62]+p[63]+ + p[64]+p[65]+p[66]+p[67]+p[68]+p[69]+p[70]+ + p[71]+p[72]+p[73]+p[74]+p[75]+p[76]+p[77]+ + p[78]+p[79]+p[80]+p[81]+p[82]+p[83]+p[84]+ + p[85]+p[86]+p[87]+p[88]+p[89]+p[90]+p[91]+ + p[92]+p[93]+p[94]+p[95]+p[96]+p[97]+p[98]+ + p[99]+p[100]+p[101]+p[102]+p[103]+p[104]+ + p[105]+p[106]+p[107]+p[108]+p[109]+p[110]+ + 
p[111]+p[112]+p[113]+p[114]+p[115]+p[116]+ + p[117]+p[118]+p[119]+p[120]+p[121]+p[122]+ + p[123]+p[124]+p[125]+p[126]+p[127]; + } + for (next = p + 16; (void*)next <= (void*)end; p = next, next += 16) { + sum += + p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ + p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ + p[15]; + } + for (next = p + 1; (void*)next <= (void*)end; p = next, next++) { + sum += *p; + } + return sum; +} + +void +touch(char *buf, int nbytes) +{ + static psize; + + if (!psize) { + psize = getpagesize(); + } + while (nbytes > 0) { + *buf = 1; + buf += psize; + nbytes -= psize; + } +} + +size_t* +permutation(int max, int scale) +{ + size_t i, v; + static size_t r = 0; + size_t* result = (size_t*)malloc(max * sizeof(size_t)); + + if (result == NULL) return NULL; + + for (i = 0; i < max; ++i) { + result[i] = i * (size_t)scale; + } + + if (r == 0) + r = (getpid()<<6) ^ getppid() ^ rand() ^ (rand()<<10); + + /* randomize the sequence */ + for (i = max - 1; i > 0; --i) { + r = (r << 1) ^ rand(); + v = result[r % (i + 1)]; + result[r % (i + 1)] = result[i]; + result[i] = v; + } + +#ifdef _DEBUG + fprintf(stderr, "permutation(%d): {", max); + for (i = 0; i < max; ++i) { + fprintf(stderr, "%d", result[i]); + if (i < max - 1) + fprintf(stderr, ","); + } + fprintf(stderr, "}\n"); + fflush(stderr); +#endif /* _DEBUG */ + + return (result); +} + +int +cp(char* src, char* dst, mode_t mode) +{ + int sfd, dfd; + char buf[8192]; + ssize_t size; + + if ((sfd = open(src, O_RDONLY)) < 0) { + return -1; + } + if ((dfd = open(dst, O_CREAT|O_TRUNC|O_RDWR, mode)) < 0) { + return -1; + } + while ((size = read(sfd, buf, 8192)) > 0) { + if (write(dfd, buf, size) < size) return -1; + } + fsync(dfd); + close(sfd); + close(dfd); +} + +#if defined(hpux) || defined(__hpux) +int +getpagesize() +{ + return (sysconf(_SC_PAGE_SIZE)); +} +#endif + +#ifdef WIN32 +int +getpagesize() +{ + SYSTEM_INFO s; + + GetSystemInfo(&s); + return ((int)s.dwPageSize); +} + +LARGE_INTEGER +getFILETIMEoffset() +{ + 
SYSTEMTIME s; + FILETIME f; + LARGE_INTEGER t; + + s.wYear = 1970; + s.wMonth = 1; + s.wDay = 1; + s.wHour = 0; + s.wMinute = 0; + s.wSecond = 0; + s.wMilliseconds = 0; + SystemTimeToFileTime(&s, &f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + return (t); +} + +int +gettimeofday(struct timeval *tv, struct timezone *tz) +{ + LARGE_INTEGER t; + FILETIME f; + double microseconds; + static LARGE_INTEGER offset; + static double frequencyToMicroseconds; + static int initialized = 0; + static BOOL usePerformanceCounter = 0; + + if (!initialized) { + LARGE_INTEGER performanceFrequency; + initialized = 1; + usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency); + if (usePerformanceCounter) { + QueryPerformanceCounter(&offset); + frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.; + } else { + offset = getFILETIMEoffset(); + frequencyToMicroseconds = 10.; + } + } + if (usePerformanceCounter) QueryPerformanceCounter(&t); + else { + GetSystemTimeAsFileTime(&f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + } + + t.QuadPart -= offset.QuadPart; + microseconds = (double)t.QuadPart / frequencyToMicroseconds; + t.QuadPart = microseconds; + tv->tv_sec = t.QuadPart / 1000000; + tv->tv_usec = t.QuadPart % 1000000; + return (0); +} +#endif diff --git a/performance/lmbench3/src/lib_udp.c b/performance/lmbench3/src/lib_udp.c new file mode 100644 index 0000000..4e4a5a6 --- /dev/null +++ b/performance/lmbench3/src/lib_udp.c @@ -0,0 +1,96 @@ +/* + * udp_lib.c - routines for managing UDP connections + * + * %W% %G% + * + * Copyright (c) 1994 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a UDP socket, bind it, figure out the port, + * and advertise the port as program "prog". + * + * XXX - it would be nice if you could advertise ascii strings. 
+ */ +int +udp_server(u_long prog, int rdwr) +{ + int sock; + struct sockaddr_in s; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + bzero((void*)&s, sizeof(s)); + s.sin_family = AF_INET; +#ifdef NO_PORTMAPPER + s.sin_port = htons(prog); +#endif + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } +#ifndef NO_PORTMAPPER + (void)pmap_unset(prog, (u_long)1); + if (!pmap_set(prog, (u_long)1, (u_long)IPPROTO_UDP, + (unsigned short)sockport(sock))) { + perror("pmap_set"); + exit(5); + } +#endif + return (sock); +} + +/* + * Unadvertise the socket + */ +void +udp_done(int prog) +{ + (void)pmap_unset((u_long)prog, (u_long)1); +} + +/* + * "Connect" to the UCP socket advertised as "prog" on "host" and + * return the connected socket. + */ +int +udp_connect(char *host, u_long prog, int rdwr) +{ + struct hostent *h; + struct sockaddr_in sin; + int sock; + u_short port; + + if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { + perror("socket"); + exit(1); + } + sock_optimize(sock, rdwr); + if (!(h = gethostbyname(host))) { + perror(host); + exit(2); + } + bzero((void *) &sin, sizeof(sin)); + sin.sin_family = AF_INET; + bcopy((void*)h->h_addr, (void *) &sin.sin_addr, h->h_length); +#ifdef NO_PORTMAPPER + sin.sin_port = htons(prog); +#else + port = pmap_getport(&sin, prog, (u_long)1, IPPROTO_UDP); + if (!port) { + perror("lib UDP: No port found"); + exit(3); + } + sin.sin_port = htons(port); +#endif + if (connect(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { + perror("connect"); + exit(4); + } + return (sock); +} diff --git a/performance/lmbench3/src/lib_udp.h b/performance/lmbench3/src/lib_udp.h new file mode 100644 index 0000000..d414d52 --- /dev/null +++ b/performance/lmbench3/src/lib_udp.h @@ -0,0 +1,12 @@ +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> + +int 
udp_server(u_long prog, int rdwr); +void udp_done(int prog); +int udp_connect(char *host, u_long prog, int rdwr); +void sock_optimize(int sock, int rdwr); +int sockport(int); + diff --git a/performance/lmbench3/src/lib_unix.c b/performance/lmbench3/src/lib_unix.c new file mode 100644 index 0000000..bd588cd --- /dev/null +++ b/performance/lmbench3/src/lib_unix.c @@ -0,0 +1,97 @@ +/* + * unix_lib.c - routines for managing UNIX connections. + * + * Positive port/program numbers are RPC ports, negative ones are UNIX ports. + * + * Copyright (c) 1994-1996 Larry McVoy. + */ +#define _LIB /* bench.h needs this */ +#include "bench.h" + +/* + * Get a UNIX socket, bind it. + */ +int +unix_server(char *path) +{ + int sock; + struct sockaddr_un s; + +#ifdef LIBUNIX_VERBOSE + fprintf(stderr, "unix_server(%s, %u)\n", prog, rdwr); +#endif + if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + bzero((void*)&s, sizeof(s)); + s.sun_family = AF_UNIX; + strcpy(s.sun_path, path); + if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("bind"); + exit(2); + } + if (listen(sock, 100) < 0) { + perror("listen"); + exit(4); + } + return (sock); +} + +/* + * Unadvertise the socket + */ +int +unix_done(int sock, char *path) +{ + close(sock); + unlink(path); + return (0); +} + +/* + * Accept a connection and return it + */ +int +unix_accept(int sock) +{ + struct sockaddr_un s; + int newsock, namelen; + + namelen = sizeof(s); + bzero((void*)&s, namelen); + +retry: + if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { + if (errno == EINTR) + goto retry; + perror("accept"); + exit(6); + } + return (newsock); +} + +/* + * Connect to the UNIX socket advertised as "path" and + * return the connected socket. 
+ */ +int +unix_connect(char *path) +{ + struct sockaddr_un s; + int sock; + + if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("socket"); + exit(1); + } + bzero((void*)&s, sizeof(s)); + s.sun_family = AF_UNIX; + strcpy(s.sun_path, path); + if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { + perror("connect"); + exit(4); + } + return (sock); +} + diff --git a/performance/lmbench3/src/lib_unix.h b/performance/lmbench3/src/lib_unix.h new file mode 100644 index 0000000..859e472 --- /dev/null +++ b/performance/lmbench3/src/lib_unix.h @@ -0,0 +1,8 @@ +/* lib_unix.c */ +#ifndef _LIB_UNIX_H_ +#define _LIB_UNIX_H_ +int unix_server(char *path); +int unix_done(int sock, char *path); +int unix_accept(int sock); +int unix_connect(char *path); +#endif diff --git a/performance/lmbench3/src/line.c b/performance/lmbench3/src/line.c new file mode 100644 index 0000000..3b5314d --- /dev/null +++ b/performance/lmbench3/src/line.c @@ -0,0 +1,68 @@ +/* + * line.c - guess the cache line size + * + * usage: line + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/4 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, j, l; + int verbose = 0; + int warmup = 0; + int repetitions = TRIES; + int c; + size_t maxlen = 64 * 1024 * 1024; + struct mem_state state; + char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; + + state.line = sizeof(char*); + state.pagesize = getpagesize(); + + while (( c = getopt(ac, av, "avM:W:N:")) != EOF) { + switch(c) { + case 'v': + verbose = 1; + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if ((l = line_find(maxlen, warmup, repetitions, &state)) > 0) { + if (verbose) { + printf("cache line size: %d bytes\n", l); + } else { + printf("%d\n", l); + } + } + + return (0); +} diff --git a/performance/lmbench3/src/lmdd.1 b/performance/lmbench3/src/lmdd.1 new file mode 100644 index 0000000..a1e7f7e --- /dev/null +++ b/performance/lmbench3/src/lmdd.1 @@ -0,0 +1,131 @@ +.\" %W% %G% +.TH LMDD 1 +.SH NAME +lmdd \- move io for performance and debugging tests +.SH SYNOPSIS +.B lmdd +[ +.IB option = value +] .\|.\|. +.SH DESCRIPTION +.B lmdd +copies a specified input file to a specified output with possible +conversions. This program is primarily useful for timing I/O since it +prints out the timing statistics after completing. +.SH OPTIONS +.TP 15 +.BI if= name +Input file is taken from +.IR name ; +.I internal +is the default. +.I internal +is a special file that acts like Sun's +.IR /dev/zero , +i.e., it provides a buffer of zeros without doing a system call to get them. +.TP +.BI of= name +Output file is taken from +.IR name ; +.I internal +is the default. 
+.I internal +is a special file that acts like +.IR /dev/null , +without doing a system call to get rid of the data. +.TP +.BI bs= n +Input and output block size +.I n +bytes (default 8192). Note that this is different from dd(1), it has +a 512 byte default. Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +respectively. +.TP +.BI ipat= n +If +.B n +is non zero, expect a known pattern in the file (see opat). Mismatches +will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is +a sequence of 4 byte integers with the first 0, second 1, and so on. +The default is not to check for the pattern. +.TP +.BI opat= n +If +.B n +is non zero, generate a known pattern on the output stream. Used for +debugging file system correctness. +The default is not to generate the pattern. +.TP +.BI mismatch= n +If +.B n +is non zero, stop at the first mismatched value. Used with ipat. +.TP +.BI skip= n +Skip +.IR n "" +input blocks before starting copy. +.TP +.BI fsync= n +If +.I n +is non-zero, call fsync(2) on the output file before exiting or printing +timing statistics. +.TP +.BI sync= n +If +.I n +is non-zero, call sync(2) before exiting or printing +timing statistics. +.TP +.BI rand= n +This argument, by default off, turns on random behavior. The argument is +not a flag, it is a size, that size is used as the upper bound for the +seeks. +Also note that the block size can be followed +by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), +.TP +.BI flush= n +If +.I n +is non-zero and mmap(2) is available, call msync(2) to invalidate the +output file. This flushes the file to disk so that you don't have +unmount/mount. It is not as good as mount/unmount because it just +flushes file pages - it misses the indirect blocks which are still +cached. Not supported on all systems, compile time option. 
+.TP +.BI rusage= n +If +.I n +is non-zero, print rusage statistics as well as timing statistics. +Not supported on all systems, compile time option. +.TP +.BI count= n +Copy only +.IR n "" +input records. +.SH EXAMPLES +.LP +This is the most common usage, the intent is to measure disk performance. +The disk is a spare partition mounted on /spare. +.sp +.nf +.in +4 +# mount /spare +# lmdd if=internal of=/spare/XXX count=1000 fsync=1 +7.81 MB in 3.78 seconds (2.0676 MB/sec) + +: Flush cache +# umount /spare +# mount /spare + +# lmdd if=/spare/XXX of=internal +7.81 MB in 2.83 seconds (2.7611 MB/sec) +.in +.sp +.fi +.SH AUTHOR +Larry McVoy, lm@xxxxxxx +.br +Not copyrighted. diff --git a/performance/lmbench3/src/lmdd.c b/performance/lmbench3/src/lmdd.c new file mode 100644 index 0000000..419f03f --- /dev/null +++ b/performance/lmbench3/src/lmdd.c @@ -0,0 +1,893 @@ +char *id = "$Id: lmdd.c,v 1.23 1997/12/01 23:47:59 lm Exp $\n"; +/* + * defaults: + * bs=8k + * count=forever + * if=internal + * of=internal + * ipat=0 + * opat=0 + * mismatch=0 + * rusage=0 + * flush=0 + * rand=0 + * print=0 + * direct=0 + * rt=0 + * rtmax=0 + * wtmax=0 + * rtmin=0 + * wtmin=0 + * label="" + * shorthands: + * k, m, g are 2^10, 2^20, 2^30 multipliers. + * K, M, G are 10^3, 10^6, 10^9 multipliers. + * recognizes "internal" as an internal /dev/zero /dev/null file. + * + * Copyright (c) 1994-1998 by Larry McVoy. All rights reserved. + * See the file COPYING for the licensing terms. + * + * TODO - rewrite this entire thing from scratch. This is disgusting code. 
+ */ + +#ifndef __Lynx__ +#define FLUSH +#endif + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/time.h> +#include "bench.h" + +#undef ALIGN +#define ALIGN(x, bs) ((x + (bs - 1)) & ~(bs - 1)) + +#ifdef FLUSH +#include <sys/mman.h> +#include <sys/stat.h> +void flush(void); +#endif + +#define USE_VALLOC +#ifdef USE_VALLOC +#define VALLOC valloc +#else +#define VALLOC malloc +#endif + +#ifdef __sgi +# define LSEEK(a,b,c) (uint64)lseek64(a, (off64_t)b, c) +# define ATOL(s) atoll(s) +#else +# define LSEEK(a,b,c) (uint64)lseek(a, b, c) +# define ATOL(s) atol(s) +#endif + + +int awrite, poff, out, Print, Fsync, Sync, Flush, Bsize, ru; +uint64 Start, End, Rand, int_count; +int hash; +int Realtime, Notrunc; +int Rtmax, Rtmin, Wtmax, Wtmin; +int rthist[12]; /* histogram of read times */ +int wthist[12]; /* histogram of write times */ +char *Label; +uint64 *norepeat; +int norepeats = -1; +#ifdef USE_BDS + bds_msg *m1, *m2; +#endif + +uint64 getarg(); +int been_there(uint64 off); +int getfile(char *s, int ac, char **av); + +char *cmds[] = { + "bs", /* block size */ + "bufs", /* use this many buffers round robin */ + "count", /* number of blocks */ +#ifdef DBG + "debug", /* set external variable "dbg" */ +#endif +#ifdef O_DIRECT + "direct", /* direct I/O on input and output */ + "idirect", /* direct I/O on input */ + "odirect", /* direct I/O on output */ +#endif +#ifdef FLUSH + "flush", /* map in out and invalidate (flush) */ +#endif + "fork", /* fork to do write I/O */ + "fsync", /* fsync output before exit */ + "if", /* input file */ + "ipat", /* check input for pattern */ + "label", /* prefix print out with this */ + "mismatch", /* stop at first mismatch */ + "move", /* instead of count, limit transfer to this */ + "of", /* output file */ + "opat", /* generate pattern on output */ + "print", /* report type */ + "rand", /* do randoms over 
the specified size */ + /* must be power of two, not checked */ + "poff", /* Print the offsets as we do the io. */ +#ifdef RUSAGE + "rusage", /* dump rusage stats */ +#endif + "skip", /* skip this number of blocks */ + "sync", /* sync output before exit */ + "touch", /* touch each buffer after the I/O */ +#if !defined(hpux) + "usleep", /* sleep this many usecs between I/O */ +#endif + "hash", /* hash marks like FTP */ + "append", /* O_APPEND */ + "rtmax", /* read latency histogram max in mills */ + "wtmax", /* write latency histogram max in mills */ + "rtmin", /* read latency histogram max in mills */ + "wtmin", /* write latency histogram max in mills */ + "realtime", /* create files as XFS realtime files */ + "notrunc", /* overwrite rather than truncing out file */ + "end", /* limit randoms to this size near the + * Rand endpoints. */ + "start", /* Add this to Rand */ + "time", /* Run for this many seconds only. */ + "srand", /* Seed the random number generator */ + "padin", /* Pad an extra untimed block_size read */ +#ifdef USE_BDS + "awrite", /* use async writes and pipeline them. 
*/ +#endif + "norepeat", /* don't ever do the same I/O twice */ +#ifdef sgi + "mpin", /* pin the buffer */ +#endif + "timeopen", /* include open time in results */ + "nocreate", /* just open for writing, don't create/trunc it */ +#ifdef O_SYNC + "osync", /* O_SYNC */ +#endif + 0, +}; + + +void error(char *); +void done(); +#ifdef DBG +extern int dbg; +#endif + +int +main(int ac, char **av) +{ + uint *buf; + uint *bufs[10]; + int nbufs, nextbuf = 0; + int Fork, misses, mismatch, outpat, inpat, in, timeopen, gotcnt; + int slp; + uint64 skip, size, count; + void chkarg(); + int i; + uint64 off = 0; + int touch; + int time; + int mills; + int pad_in; + int pid = 0; + struct timeval start_tv; + struct timeval stop_tv; + + if (sizeof(int) != 4) { + fprintf(stderr, "sizeof(int) != 4\n"); + exit(1); + } + for (i = 1; i < ac; ++i) { + chkarg(av[i]); + } + signal(SIGINT, done); + signal(SIGALRM, done); + misses = mismatch = getarg("mismatch=", ac, av); + inpat = getarg("ipat=", ac, av); + outpat = getarg("opat=", ac, av); + Bsize = getarg("bs=", ac, av); + if (Bsize < 0) + Bsize = 8192; +#if !defined(hpux) + slp = getarg("usleep=", ac, av); +#endif + Fork = getarg("fork=", ac, av); + Fsync = getarg("fsync=", ac, av); + Sync = getarg("sync=", ac, av); + Rand = getarg("rand=", ac, av); + Start = getarg("start=", ac, av); + End = getarg("end=", ac, av); + time = getarg("time=", ac, av); + if ((End != -1) && (Rand != -1) && (End > Rand)) { + End = Rand; + } + if (getarg("srand=", ac, av) != -1) { + srand48((long)getarg("srand=", ac, av)); + } + poff = getarg("poff=", ac, av) != -1; + Print = getarg("print=", ac, av); + nbufs = getarg("bufs=", ac, av); + Realtime = getarg("realtime=", ac, av); + Rtmax = getarg("rtmax=", ac, av); + if ((Rtmax != -1) && (Rtmax < 10)) + Rtmax = 10; + Rtmin = getarg("rtmin=", ac, av); + if ((Rtmax != -1) && (Rtmin == -1)) { + Rtmin = 0; + } + Wtmax = getarg("wtmax=", ac, av); + if ((Wtmax != -1) && (Wtmax < 10)) + Wtmax = 10; + Wtmin = 
getarg("wtmin=", ac, av); + if ((Wtmax != -1) && (Wtmin == -1)) { + Wtmin = 0; + } + if ((Rtmin && !Rtmax) || (Wtmin && !Wtmax)) { + fprintf(stderr, "Need a max to go with that min.\n"); + exit(1); + } + if ((Rtmin > Rtmax) || (Wtmin > Wtmax)) { + fprintf(stderr, + "min has to be less than max, R=%d,%d W=%d,%d\n", + Rtmax, Rtmin, Wtmax, Wtmin); + exit(1); + } + timeopen = getarg("timeopen=", ac, av); + pad_in = getarg("padin=", ac, av); + if (pad_in == -1) pad_in = 0; + + if (nbufs == -1) nbufs = 1; + if (nbufs > 10) { printf("Too many bufs\n"); exit(1); } +#ifdef DBG + dbg = getarg("debug=", ac, av) != -1; +#endif +#ifdef RUSAGE + ru = getarg("rusage=", ac, av); +#endif + touch = getarg("touch=", ac, av) != -1; + hash = getarg("hash=", ac, av) != (uint64)-1; + Label = (char *)getarg("label=", ac, av); + count = getarg("count=", ac, av); + size = getarg("move=", ac, av); + if (size != (uint64)-1) + count = size / Bsize; + if (Rand != -1) { + size = Rand - Bsize; + size = ALIGN(size, Bsize); + } + +#ifdef FLUSH + Flush = getarg("flush=", ac, av); +#endif + if (count == (uint64)-1) + gotcnt = 0; + else + gotcnt = 1; + int_count = 0; + skip = getarg("skip=", ac, av); + if (getarg("norepeat=", ac, av) != -1) { + if (gotcnt) { + norepeat = (uint64*)calloc(count, sizeof(uint64)); + } else { + norepeat = (uint64*)calloc(10<<10, sizeof(uint64)); + } + } + + if ((inpat != -1 || outpat != -1) && (Bsize & 3)) { + fprintf(stderr, "Block size 0x%x must be word aligned\n", Bsize); + exit(1); + } + if ((Bsize >> 2) == 0) { + fprintf(stderr, "Block size must be at least 4.\n"); + exit(1); + } + for (i = 0; i < nbufs; i++) { + if (!(bufs[i] = (uint *) VALLOC((unsigned) Bsize))) { + perror("VALLOC"); + exit(1); + } + bzero((char *) bufs[i], Bsize); +#ifdef sgi + if (getarg("mpin=", ac, av) != -1) { + if (mpin((void *)bufs[i], (size_t)Bsize)) { + perror("mpin for adam"); + } + } +#endif + } + + if (time != -1) { + alarm(time); + } + if (timeopen != -1) { + start(NULL); + } + in = 
getfile("if=", ac, av); + out = getfile("of=", ac, av); + if (timeopen == -1) { + start(NULL); + } + if ((Rtmax != -1) && in < 0) { + fprintf(stderr, "I think you wanted wtmax, not rtmax\n"); + exit(1); + } + if ((Wtmax != -1) && out < 0) { + fprintf(stderr, "I think you wanted rtmax, not wtmax\n"); + exit(1); + } + if (skip != (uint64)-1) { + off = skip; + off *= Bsize; + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + if (poff) { + fprintf(stderr, "%s ", p64sz(off)); + } + } + for (;;) { + register int moved; + + if (gotcnt && count-- <= 0) { + done(); + } + + /* + * If End is set, it means alternate back and forth + * between the end points of Rand, doing randoms within + * the area 0..End and Rand-End..Rand + */ + if (End != -1) { + static uint64 start = 0; + + start = start ? 0 : Rand - End; + do { + off = drand48() * End; + off = ALIGN(off, Bsize); + off += start; + if (Start != -1) { + off += Start; + } + } while (norepeat && been_there(off)); + if (norepeat) { + norepeat[norepeats++] = off; + if (!gotcnt && (norepeats == 10<<10)) { + norepeats = 0; + } + } + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + } + /* + * Set the seek pointer if doing randoms + */ + else if (Rand != -1) { + do { + off = drand48() * (size - Bsize); + if (Start != -1) { + off += Start; + } + off = ALIGN(off, Bsize); + } while (norepeat && been_there(off)); + if (norepeat) { + norepeat[norepeats++] = off; + } + if (!gotcnt && (norepeats == 10<<10)) { + norepeats = 0; + } + if (in >= 0) { + LSEEK(in, off, 0); + } + if (out >= 0) { + LSEEK(out, off, 0); + } + } + if (poff) { + fprintf(stderr, "%s ", p64sz(off)); + } + + buf = bufs[nextbuf]; + if (++nextbuf == nbufs) nextbuf = 0; + if (in >= 0) { + if ((Rtmax != -1) || (Rtmin != -1)) { + start(&start_tv); + } + moved = read(in, buf, Bsize); + + if (pad_in) { /* ignore this run, restart clock */ + pad_in = 0; + count++; + start(NULL); + continue; + } + + if 
((Rtmax != -1) || (Rtmin != -1)) { + int mics = stop(&start_tv, &stop_tv); + + mills = mics / 1000; + if ((mills > Rtmax) || (mills < Rtmin)) { + fprintf(stderr, + "READ: %.02f milliseconds offset %s\n", + ((float)mics) / 1000, + p64sz(LSEEK(in, 0, SEEK_CUR))); + } + /* + * Put this read time in the histogram. + * The buckets are each 1/10th of Rtmax. + */ + if (mills >= Rtmax) { + rthist[11]++; + } else if (mills < Rtmin) { + rthist[0]++; + } else { + int step = (Rtmax - Rtmin) / 10; + int i; + + for (i = 1; i <= 10; ++i) { + if (mills < i * step + Rtmin) { + rthist[i]++; + break; + } + } + } + } + } else { + moved = Bsize; + } + if (moved == -1) { + perror("read"); + } + if (moved <= 0) { + done(); + } + if (inpat != -1) { + register int foo, cnt; + + for (foo = 0, cnt = moved/sizeof(int); cnt--; foo++) { + if (buf[foo] != (uint) (off + foo*sizeof(int))) { + fprintf(stderr, + "off=%u want=%x got=%x\n", + (uint)off, + (uint)(off + foo*sizeof(int)), + buf[foo]); + if (mismatch != -1 && --misses == 0) { + done(); + } + } + } + } + if ((in >= 0) && touch) { + int i; + + for (i = 0; i < moved; i += 4096) { + ((char *)buf)[i] = 0; + } + } + if (out >= 0) { + int moved2; + + if (Fork != -1) { + if (pid) { + waitpid(pid, 0, 0); + } + if ((pid = fork())) { + off += moved; + int_count += (moved >> 2); + continue; + } + } + if (outpat != -1) { + register int foo, cnt; + + for (foo = 0, cnt = moved/sizeof(int); + cnt--; foo++) { + buf[foo] = + (uint)(off + foo*sizeof(int)); + } + } + if ((Wtmax != -1) || (Wtmin != -1)) { + start(&start_tv); + } +#ifdef USE_BDS + /* + * The first time through, m1 & m2 are null. + * The Nth time through, we start the I/O into + * m2, and wait on m1, then switch. 
+ */ + if (awrite) { + if (m1) { + m2 = bds_awrite(out, buf, moved); + moved2 = bds_adone(out, m1); + m1 = m2; + } else { + m1 = bds_awrite(out, buf, moved); + goto writedone; + } + } else { + moved2 = write(out, buf, moved); + } +#else + moved2 = write(out, buf, moved); +#endif + + if (moved2 == -1) { + perror("write"); + } + if (moved2 != moved) { + fprintf(stderr, "write: wanted=%d got=%d\n", + moved, moved2); + done(); + } + if ((Wtmax != -1) || (Wtmin != -1)) { + int mics = stop(&start_tv, &stop_tv); + + mills = mics / 1000; + if ((mills > Wtmax) || (mills < Wtmin)) { + fprintf(stderr, + "WRITE: %.02f milliseconds offset %s\n", + ((float)mics) / 1000, + p64sz(LSEEK(out, 0, SEEK_CUR))); + } + /* + * Put this write time in the histogram. + * The buckets are each 1/10th of Wtmax. + */ + if (mills >= Wtmax) { + wthist[11]++; + } else if (mills < Wtmin) { + wthist[0]++; + } else { + int step = (Wtmax - Wtmin) / 10; + int i; + + for (i = 1; i <= 10; ++i) { + if (mills < i * step + Wtmin) { + wthist[i]++; + break; + } + } + } + } + + if (moved2 == -1) { + perror("write"); + } + if (moved2 != moved) { + done(); + } + + if (touch) { + int i; + + for (i = 0; i < moved; i += 4096) { + ((char *)buf)[i] = 0; + } + } + } +#ifdef USE_BDS +writedone: /* for the first async write */ +#endif + off += moved; + int_count += (moved >> 2); +#if !defined(hpux) + if (slp != -1) { + usleep(slp); + } +#endif + if (hash) { + fprintf(stderr, "#"); + } + if (Fork != -1) { + exit(0); + } + } +} + +int +been_there(uint64 off) +{ + register int i; + + for (i = 0; i <= norepeats; ++i) { + if (off == norepeat[i]) { + fprintf(stderr, "norepeat on %u\n", (uint)off); + return (1); + } + } + return (0); +} + +void +chkarg(char *arg) +{ + int i; + char *a, *b; + + for (i = 0; cmds[i]; ++i) { + for (a = arg, b = cmds[i]; *a && *b && *a == *b; a++, b++) + ; + if (*a == '=') + return; + } + fprintf(stderr, "Bad arg: %s, possible arguments are: ", arg); + for (i = 0; cmds[i]; ++i) { + fprintf(stderr, 
"%s ", cmds[i]); + } + fprintf(stderr, "\n"); + exit(1); + /*NOTREACHED*/ +} + +void +done(void) +{ + int i; + int step; + int size; + +#ifdef USE_BDS + if (awrite && m1) { + bds_adone(out, m1); + } +#endif + if (Sync > 0) + sync(); + if (Fsync > 0) + fsync(out); +#ifdef FLUSH + if (Flush > 0) + flush(); +#endif + stop(NULL, NULL); +#ifdef RUSAGE + if (ru != -1) + rusage(); +#endif + if (hash || poff) { + fprintf(stderr, "\n"); + } + if ((long)Label != -1) { + fprintf(stderr, "%s", Label); + } + int_count <<= 2; + switch (Print) { + case 0: /* no print out */ + break; + + case 1: /* latency type print out */ + latency((uint64)(int_count / Bsize), (uint64)Bsize); + break; + + case 2: /* microsecond per op print out */ + micro("", (uint64)(int_count / Bsize)); + break; + + case 3: /* kb / sec print out */ + kb(int_count); + break; + + case 4: /* mb / sec print out */ + mb(int_count); + break; + + case 5: /* Xgraph output */ + bandwidth(int_count, 1, 0); + break; + + default: /* bandwidth print out */ + bandwidth(int_count, 1, 1); + break; + } + if (Rtmax != -1) { + printf("READ operation latencies\n"); + step = (Rtmax - Rtmin) / 10; + if (rthist[0]) { + printf("%d- ms: %d\n", Rtmin, rthist[0]); + } + for (i = 1, size = Rtmin; i <= 10; i++, size += step) { + if (!rthist[i]) + continue; + printf("%d to %d ms: %d\n", + size, size + step - 1, rthist[i]); + } + if (rthist[11]) { + printf("%d+ ms: %d\n", Rtmax, rthist[11]); + } + } + if (Wtmax != -1) { + printf("WRITE operation latencies\n"); + step = (Wtmax - Wtmin) / 10; + if (wthist[0]) { + printf("%d- ms: %d\n", Wtmin, wthist[0]); + } + for (i = 1, size = Wtmin; i <= 10; i++, size += step) { + if (!wthist[i]) + continue; + printf("%d to %d ms: %d\n", + size, size + step - 1, wthist[i]); + } + if (wthist[11]) { + printf("%d+ ms: %d\n", Wtmax, wthist[11]); + } + } + exit(0); +} + +uint64 +getarg(char *s, int ac, char **av) +{ + register uint64 len, i; + + len = strlen(s); + + for (i = 1; i < ac; ++i) { + if 
(!strncmp(av[i], s, len)) { + register uint64 bs = ATOL(&av[i][len]); + + switch (av[i][strlen(av[i]) - 1]) { + case 'K': bs *= 1000; break; + case 'k': bs <<= 10; break; + case 'M': bs *= 1000000; break; + case 'm': bs <<= 20; break; + case 'G': bs *= 1000000000L; break; + case 'g': bs <<= 30; break; + } + + if (!strncmp(av[i], "label", 5)) { + return (uint64)(long)(&av[i][len]); /* HACK */ + } + if (!strncmp(av[i], "bs=", 3)) { + return (uint64)(bs); + } + return (bs); + } + } + return ((uint64)-1); +} + +char *output; + +int +getfile(char *s, int ac, char **av) +{ + register int ret, len, i; + int append = getarg("append=", ac, av) != -1; + int notrunc = getarg("notrunc=", ac, av) != -1; + int nocreate = getarg("nocreate=", ac, av) != -1; +#ifdef O_SYNC + int osync = getarg("osync=", ac, av) != -1; +#endif + int oflags; + + len = strlen(s); + + for (i = 1; i < ac; ++i) { + if (!strncmp(av[i], s, len)) { + if (av[i][0] == 'o') { + if (!strcmp("of=internal", av[i])) + return (-2); + if (!strcmp("of=stdout", av[i])) + return (1); + if (!strcmp("of=1", av[i])) + return (1); + if (!strcmp("of=-", av[i])) + return (1); + if (!strcmp("of=stderr", av[i])) + return (2); + if (!strcmp("of=2", av[i])) + return (2); + oflags = O_WRONLY; + oflags |= (notrunc || append) ? 0 : O_TRUNC; + oflags |= nocreate ? 0 : O_CREAT; + oflags |= append ? O_APPEND : 0; +#ifdef O_SYNC + oflags |= osync ? 
O_SYNC : 0; +#endif + ret = open(&av[i][len], oflags,0644); +#ifdef O_DIRECT + if ((getarg("odirect=", ac, av) != -1) || + (getarg("direct=", ac, av) != -1)) { + close(ret); + ret = open(&av[i][len], oflags|O_DIRECT); + awrite = + getarg("awrite=", ac, av) != -1; + } +#endif + if (ret == -1) + error(&av[i][len]); +#ifdef F_FSSETXATTR + if (Realtime == 1) { + struct fsxattr fsxattr; + + bzero(&fsxattr,sizeof(struct fsxattr)); + fsxattr.fsx_xflags = 0x1; + if (fcntl(ret,F_FSSETXATTR,&fsxattr)){ + printf("WARNING: Could not make %s a real time file\n", + &av[i][len]); + } + } +#endif + output = &av[i][len]; + return (ret); + } else { + if (!strcmp("if=internal", av[i])) + return (-2); + if (!strcmp("if=stdin", av[i])) + return (0); + if (!strcmp("if=0", av[i])) + return (0); + if (!strcmp("if=-", av[i])) + return (0); + ret = open(&av[i][len], 0); +#ifdef O_DIRECT + if ((getarg("idirect=", ac, av) != -1) || + (getarg("direct=", ac, av) != -1)) { + close(ret); + ret = open(&av[i][len], O_RDONLY|O_DIRECT); + } +#endif + if (ret == -1) + error(&av[i][len]); + return (ret); + } + } + } + return (-2); +} + +#ifdef FLUSH +int +warning(char *s) +{ + if ((long)Label != -1) { + fprintf(stderr, "%s: ", Label); + } + perror(s); + return (-1); +} + +void +flush(void) +{ + int fd; + struct stat sb; + caddr_t where; + + if (output == NULL || (fd = open(output, 2)) == -1) { + warning("No output file"); + return; + } + if (fstat(fd, &sb) == -1 || sb.st_size == 0) { + warning(output); + return; + } + where = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + msync(where, sb.st_size, MS_INVALIDATE); + munmap(where, sb.st_size); +} +#endif + +void +error(char *s) +{ + if ((long)Label != -1) { + fprintf(stderr, "%s: ", Label); + } + perror(s); + exit(1); +} diff --git a/performance/lmbench3/src/lmhttp.c b/performance/lmbench3/src/lmhttp.c new file mode 100644 index 0000000..00bd4b0 --- /dev/null +++ b/performance/lmbench3/src/lmhttp.c @@ -0,0 +1,397 @@ +/* + * http_srv.c - 
simple HTTP "server" + * + * Only implements the simplest GET operation. + * + * usage: http_srv [-f#] [-l] [-d] [port] + * + * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Other authors: Steve Alexander, sca@xxxxxxx. + */ +char *id = "$Id$\n"; + +#include "bench.h" +#ifdef MAP_FILE +# define MMAP_FLAGS MAP_FILE|MAP_SHARED +#else +# define MMAP_FLAGS MAP_SHARED +#endif +#define MMAPS_BETTER (4<<10) /* mmap is faster for sizes >= this */ +#define LOGFILE "/usr/tmp/lmhttp.log" + +char *buf; +char *bufs[3]; +int Dflg, dflg, nflg, lflg, fflg, zflg; +int data, logfile; +void die(); +void worker(); +char *http_time(void); +char *date(time_t *tt); +char *type(char *name); +int source(int sock); +int isdir(char *name); +void dodir(char *name, int sock); +void fake(int sock, char *buf, int size); +void rdwr(int fd, int sock, char *buf); +int mmap_rdwr(int from, int to, int size); +void logit(int sock, char *name, int size); + + +int +main(int ac, char **av) +{ + int i, prog; +#ifdef sgi + int ncpus = sysmp(MP_NPROCS); +#endif + + for (i = 1; i < ac; ++i) { + if (av[i][0] != '-') { + break; + } + switch (av[i][1]) { + case 'D': Dflg = 1; break; /* Allow directories */ + case 'd': dflg = 1; break; /* debugging */ + case 'f': fflg = atoi(&av[i][2]); + break; /* # of threads */ + case 'l': lflg = 1; break; /* logging */ + case 'n': nflg = 1; break; /* fake file i/o */ + case 'z': zflg = 1; break; /* all files are 0 size */ + default: + fprintf(stderr, "Barf.\n"); + exit(1); + } + } + if (getenv("DOCROOT")) { + if (chdir(getenv("DOCROOT")) == -1) { + perror(getenv("DOCROOT")); + exit(1); + } + } + if (atoi(av[ac - 1]) != 0) { + prog = -atoi(av[ac - 1]); + } else { + prog = -80; + } + /* + * Steve - why is this here? 
+ */ + signal(SIGPIPE, SIG_IGN); + data = tcp_server(prog, SOCKOPT_REUSE); + bufs[0] = valloc(XFERSIZE); + bufs[1] = valloc(XFERSIZE); + bufs[2] = valloc(XFERSIZE); + logfile = open(LOGFILE, O_CREAT|O_APPEND|O_WRONLY, 0666); + signal(SIGINT, die); + signal(SIGHUP, die); + signal(SIGTERM, die); + for (i = 1; i < fflg; ++i) { + if (fork() <= 0) { + break; + } + } + handle_scheduler(i, 0, 0); + worker(); + return(0); +} + +void +worker() +{ + int newdata; + int next = 0; + + for (;;) { + buf = bufs[next]; + if (++next == 3) next = 0; + newdata = tcp_accept(data, SOCKOPT_REUSE); + source(newdata); + close(newdata); + } +} + +/* + * "Tue, 28 Jan 97 01:20:30 GMT"; + * 012345678901234567890123456 + */ +char *http_time() +{ + time_t tt; + static time_t save_tt; + struct tm *t; + static struct tm save_tm; + static char buf[100]; + + time(&tt); /* costs 10 usecs */ + if (tt == save_tt) { + return (buf); + } + save_tt = tt; + t = gmtime(&tt); /* costs 21 usecs */ + if (buf[0] && (tt - save_tt < 3600)) { + buf[22] = t->tm_sec / 10 + '0'; + buf[21] = t->tm_sec % 10 + '0'; + save_tm.tm_sec = t->tm_sec; + if (save_tm.tm_min == t->tm_min) { + return (buf); + } + } + save_tm = *t; + /* costs 120 usecs */ + strftime(buf, sizeof(buf), "%a, %d %b %y %H:%M:%S %Z", t); + return(buf); +} + +/* + * Input: dates that are probably within the last year. + * Output: Tue, 28 Jan 97 01:20:30 GMT + * + * Since it costs 150 usecs or so to do this on an Indy, it may pay to + * optimize this. + */ +char * +date(time_t *tt) +{ + return "Tue, 28 Jan 97 01:20:30 GMT"; +} + +char * +type(char *name) +{ + int len = strlen(name); + + if (!strcmp(&name[len - 4], ".gif")) { + return "image/gif"; + } + if (!strcmp(&name[len - 5], ".jpeg")) { + return "image/jpeg"; + } + if (!strcmp(&name[len - 5], ".html")) { + return "text/html"; + } + if (Dflg && isdir(name)) { + return "text/html"; + } + return "text/plain"; +} + +/* + * Read the file to be transfered. + * Write that file on the data socket. 
+ * The caller closes the socket. + */ +int +source(int sock) +{ + int fd, n, size; + char *s; + char file[100]; + char hbuf[1024]; + struct stat sb; +#define name &buf[5] + + n = read(sock, buf, XFERSIZE); + if (n <= 0) { + perror("control nbytes"); + return (-1); + } + buf[n] = 0; + if (dflg) printf("%.*s\n", n, buf); + if (zflg) { + return (0); + } + if (!strncmp(buf, "EXIT", 4)) { + exit(0); + } + if (strncmp(buf, "GET /", 5)) { + perror(buf); + return(1); + } + for (s = buf; *s && *s != '\r' && *s != '\n'; s++) + ; + *s = 0; + for (s = name; *s && *s != ' '; s++) + ; + *s = 0; + if (lflg) strncpy(file, name, sizeof(file)); + if (dflg) printf("OPEN %s\n", name); + fd = open(name, 0); + if (fd == -1) { +error: perror(name); + close(fd); + return (1); + } + if (fstat(fd, &sb) == -1) { + if (dflg) printf("Couldn't stat %s\n", name); + goto error; + } + size = sb.st_size; + n = sprintf(hbuf, "HTTP/1.0 200 OK\r\n%s\r\nServer: lmhttp/0.1\r\nContent-Type: %s\r\nLast-Modified: %s\r\n\r\n", + http_time(), type(name), date(&sb.st_mtime)); + if (write(sock, hbuf, n) != n) { + goto error; + } + if (Dflg && isdir(name)) { + dodir(name, sock); + } else if (nflg) { + fake(sock, buf, size); + } else if ((size > MMAPS_BETTER)) { /* XXX */ + if (mmap_rdwr(fd, sock, size) == -1) { + printf("%s mmap failed\n", name); + } + } else { + rdwr(fd, sock, buf); + } + if (lflg) logit(sock, file, size); + close(fd); + return(0); +} +#undef name + + +int +isdir(char *name) +{ + struct stat sb; + if (stat(name, &sb) == -1) { + return(0); + } + return (S_ISDIR(sb.st_mode)); +} + +#ifdef example +<HTML><HEAD> +<TITLE>Index of /pub/Linux</TITLE> +</HEAD><BODY> +<H1>Index of /pub/Linux</H1> +<PRE><IMG SRC="/icons/blank.gif" ALT=" "> Name Last modified Size Description +<HR> +<IMG SRC="/icons/unknown.gif" ALT="[ ]"> <A HREF="!INDEX">!INDEX</A> 19-Sep-97 03:20 3k +<IMG SRC="/icons/text.gif" ALT="[TXT]"> <A HREF="!INDEX.html">!INDEX.html</A> 19-Sep-97 03:20 6k +#endif + +void +dodir(char *name, int 
sock) +{ + FILE *p; + char buf[1024]; + char path[1024]; + + if (dflg) printf("dodir(%s)\n", name); + sprintf(buf, "cd %s && ls -1a", name); + p = popen(buf, "r"); + if (!p && dflg) printf("Couldn't popen %s\n", buf); + sprintf(buf, "\ +<HTML><HEAD>\n<TITLE>Index of /%s</TITLE></HEAD><BODY><H1>Index of /%s</H1>\n", + name, name); + write(sock, buf, strlen(buf)); + while (fgets(buf, sizeof(buf), p)) { + buf[strlen(buf) - 1] = 0; + sprintf(path, "/%s/%s", name, buf); + if (dflg) printf("\t%s\n", path); + write(sock, "<A HREF=\"", 9); + write(sock, path, strlen(path)); + write(sock, "\">", 2); + write(sock, buf, strlen(buf)); + write(sock, "</A><BR>\n", 9); + } + pclose(p); +} + +void +fake(int sock, char *buf, int size) +{ + int n; + + while (size > 0) { + n = write(sock, buf, size > XFERSIZE ? XFERSIZE : size); + if (n == -1) { + perror("write on socket"); + return; + } + size -= n; + } +} + +void +rdwr(int fd, int sock, char *buf) +{ + int nread; + + while ((nread = read(fd, buf, XFERSIZE)) > 0) { + int i; + + for (i = 0; i < nread; ) { + int nwrote = write(sock, buf, nread - i); + + if (i < 0) { + exit(1); + } + i += nwrote; + } + } +} + +int +mmap_rdwr(int from, int to, int size) +{ + char *buf; + int done = 0, wrote; + + buf = mmap(0, size, PROT_READ, MMAP_FLAGS, from, 0); + if ((long)buf == -1) { + perror("mmap"); + return (-1); + } + do { + wrote = write(to, buf + done, size - done); + if (wrote == -1) { + perror("write"); + break; + } + done += wrote; + } while (done < size); + if (munmap(buf, size) == -1) { + perror("unmap"); + } + return (0); +} + +static char logbuf[64<<10]; /* buffer into here */ +static int nbytes; /* bytes buffered */ + +/* + * HTTP server logging, compressed format. 
+ */ +void +logit(int sock, char *name, int size) +{ + struct sockaddr_in sin; + int len = sizeof(sin); + char buf[1024 + 16]; /* maxpathlen + others */ + + if (getpeername(sock, (struct sockaddr*)&sin, &len) == -1) { + perror("getpeername"); + return; + } + len = sprintf(buf, "%u %u %s %u\n", + *((unsigned int*)&sin.sin_addr), (unsigned int)time(0), name, size); + if (nbytes + len >= sizeof(logbuf)) { + write(logfile, logbuf, nbytes); + nbytes = 0; + } + bcopy(buf, &logbuf[nbytes], len); + nbytes += len; +} + +void die() +{ + if (nbytes) { + write(logfile, logbuf, nbytes); + nbytes = 0; + } + exit(1); +} diff --git a/performance/lmbench3/src/loop_o.c b/performance/lmbench3/src/loop_o.c new file mode 100644 index 0000000..1cc4333 --- /dev/null +++ b/performance/lmbench3/src/loop_o.c @@ -0,0 +1,8 @@ +#include "bench.h" + +int +main() +{ + printf("%.8f\n", l_overhead()); + return (0); +} diff --git a/performance/lmbench3/src/memsize.c b/performance/lmbench3/src/memsize.c new file mode 100644 index 0000000..e1d05be --- /dev/null +++ b/performance/lmbench3/src/memsize.c @@ -0,0 +1,192 @@ +/* + * memsize.c - figure out how much memory we have to use. + * + * Usage: memsize [max_wanted_in_MB] + * + * Copyright (c) 1995 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +#define CHK(x) if ((x) == -1) { perror("x"); exit(1); } + +#ifndef TOO_LONG +#define TOO_LONG 10 /* usecs */ +#endif + +int alarm_triggered = 0; + +void timeit(char *where, size_t size); +static void touchRange(char *p, size_t range, ssize_t stride); +int test_malloc(size_t size); +void set_alarm(uint64 usecs); +void clear_alarm(); + +int +main(int ac, char **av) +{ + char *where; + char *tmp; + size_t size = 0; + size_t max = 0; + size_t delta; + + if (ac == 2) { + max = size = bytes(av[1]) * 1024 * 1024; + } + if (max < 1024 * 1024) { + max = size = 1024 * 1024 * 1024; + } + /* + * Binary search down and then binary search up + */ + for (where = 0; !test_malloc(size); size >>= 1) { + max = size; + } + /* delta = size / (2 * 1024 * 1024) */ + for (delta = (size >> 21); delta > 0; delta >>= 1) { + uint64 sz = (uint64)size + (uint64)delta * 1024 * 1024; + if (max < sz) continue; + if (test_malloc(sz)) size = sz; + } + if (where = malloc(size)) { + timeit(where, size); + free(where); + } + exit (0); +} + +void +timeit(char *where, size_t size) +{ + int sum = 0; + char *end = where + size; + size_t n; + size_t s; + size_t range; + size_t incr = 1024 * 1024; + ssize_t stride; + size_t pagesize = getpagesize(); + + if (size < 1024*1024 - 16*1024) { + fprintf(stderr, "Bad size\n"); + return; + } + + range = 1024 * 1024; + incr = 1024 * 1024; + touchRange(where, range, pagesize); + for (range += incr; range <= size; range += incr) { + n = range / pagesize; + set_alarm(n * TOO_LONG); + touchRange(where + range - incr, incr, pagesize); + clear_alarm(); + set_alarm(n * TOO_LONG); + start(0); + touchRange(where, range, pagesize); + sum = stop(0, 0); + clear_alarm(); + if ((sum / n) > TOO_LONG || alarm_triggered) { + size = range - incr; + break; + } + for (s = 8 * 1024 * 1024; s <= range; s *= 2) + ; + incr = s / 8; + if (range < size && size < range + incr) { + incr = size - range; + } + fprintf(stderr, "%dMB OK\r", 
range/(1024*1024)); + } + fprintf(stderr, "\n"); + printf("%d\n", (size>>20)); +} + +static void +touchRange(char *p, size_t range, ssize_t stride) +{ + register char *tmp = p + (stride > 0 ? 0 : range - 1); + register size_t delta = (stride > 0 ? stride : -stride); + + while (range > delta - 1 && !alarm_triggered) { + *tmp = 0; + tmp += stride; + range -= delta; + } +} + +int +test_malloc(size_t size) +{ + int fid[2]; + int result; + int status; + void* p; + + if (pipe(fid) < 0) { + void* p = malloc(size); + if (!p) return 0; + free(p); + return 1; + } + if (fork() == 0) { + close(fid[0]); + p = malloc(size); + result = (p ? 1 : 0); + write(fid[1], &result, sizeof(int)); + close(fid[1]); + if (p) free(p); + exit(0); + } + close(fid[1]); + if (read(fid[0], &result, sizeof(int)) != sizeof(int)) + result = 0; + close(fid[0]); + wait(&status); + return result; +} + +void +gotalarm() +{ + alarm_triggered = 1; +} + +void +set_alarm(uint64 usecs) +{ + struct itimerval value; + struct sigaction sa; + + alarm_triggered = 0; + + sa.sa_handler = gotalarm; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(SIGALRM, &sa, 0); + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = 0; + value.it_value.tv_sec = usecs / 1000000; + value.it_value.tv_usec = usecs % 1000000; + + setitimer(ITIMER_REAL, &value, NULL); +} + +void +clear_alarm() +{ + struct itimerval value; + + value.it_interval.tv_sec = 0; + value.it_interval.tv_usec = 0; + value.it_value.tv_sec = 0; + value.it_value.tv_usec = 0; + + setitimer(ITIMER_REAL, &value, NULL); +} + diff --git a/performance/lmbench3/src/mhz.c b/performance/lmbench3/src/mhz.c new file mode 100644 index 0000000..210f6fc --- /dev/null +++ b/performance/lmbench3/src/mhz.c @@ -0,0 +1,507 @@ +/* + * mhz.c - calculate clock rate and megahertz + * + * Usage: mhz [-c] + * + ******************************************************************* + * + * Caveat emptor and other warnings + * + * This code must be compiled using the optimizer! 
If you don't + * compile this using the optimizer, then many compilers don't + * make good use of the registers and your inner loops end up + * using stack variables, which is SLOW. + * + * Also, it is sensitive to other processor load. When running + * mhz with "rtprio" (real-time priority), I have never had mhz + * make a mistake on my machine. At other times mhz has been + * wrong about 10% of the time. + * + * If there is too much noise/error in the data, then this program + * will usually return a clock speed that is too high. + * + ******************************************************************* + * + * Constraints + * + * mhz.c is meant to be platform independent ANSI/C code, and it + * has as little platform dependent code as possible. + * + * This version of mhz is designed to eliminate the variable + * instruction counts used by different compilers on different + * architectures and instruction sets. It is also structured to + * be tightly interlocked so processors with super-scalar elements + * or dynamic instructure reorder buffers cannot overlap the + * execution of the expressions. + * + * We have to try and make sure that the code in the various + * inner loops does not fall out of the on-chip instruction cache + * and that the inner loop variables fit inside the register set. + * The i386 only has six addressable registers, so we had to make + * sure that the inner loop procedures had fewer variables so they + * would not spill onto the stack. + * + ******************************************************************* + * + * Algorithm + * + * We can compute the CPU cycle time if we can get the compiler + * to generate (at least) two instruction sequences inside loops + * where the inner loop instruction counts are relatively prime. + * We have several different loops to increase the chance that + * two of them will be relatively prime on any given architecture. 
+ * + * This technique makes no assumptions about the cost of any single + * instruction or the number of instructions used to implement a + * given expression. We just hope that the compiler gets at least + * two inner loop instruction sequences with lengths that are + * relatively prime. The "relatively prime" makes the greatest + * common divisor method work. If all the instructions sequences + * have a common factor (e.g. 2), then the apparent CPU speed will + * be off by that common factor. Also, if there is too much + * variability in the data so there is no apparent least common + * multiple within the error bounds set in multiple_approx, then + * we simply return the maximum clock rate found in the loops. + * + * The processor's clock speed is the greatest common divisor + * of the execution frequencies of the various loops. For + * example, suppose we are trying to compute the clock speed + * for a 120Mhz processor, and we have two loops: + * SHR --- two cycles to shift right + * SHR;ADD --- three cycles to SHR and add + * then the expression duration will be: + * SHR 11.1ns (2 cycles/SHR) + * SHR;ADD 16.6ns (3 cycles/SHR;ADD) + * so the greatest common divisor is 5.55ns and the clock speed + * is 120Mhz. Aside from extraneous variability added by poor + * benchmarking hygiene, this method should always work when we + * are able to get loops with cycle counts that are relatively + * prime. + * + * Suppose we are unlucky, and we have our two loops do + * not have relatively prime instruction counts. Suppose + * our two loops are: + * SHR 11.1ns (2 cycles/SHR) + * SHR;ADD;SUB 22.2ns (4 cycles/SHR;ADD;SUB) + * then the greatest common divisor will be 11.1ns, so the clock + * speed will appear to be 60Mhz. + * + * The loops provided so far should have at least two relatively + * prime loops on nearly all architectures. + * + ******************************************************************* + * + * Copyright (c) 1994 Larry McVoy. 
Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Silicon Graphics is gratefully acknowledged. + * Support for this development by Hewlett Packard is gratefully acknowledged. + * Support for this development by Sun Microsystems is gratefully acknowledged. + * + ******************************************************************* + */ +char *id = "$Id$\n"; + +#include "bench.h" +#include <math.h> + +typedef long TYPE; + +#define TEN(A) A A A A A A A A A A +#define HUNDRED(A) TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) \ + TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) + +#define MHZ(M, contents) \ +char* \ +name_##M() \ +{ \ + return #contents; \ +} \ + \ +TYPE** \ +_mhz_##M (register long n, register TYPE **p, \ + register TYPE a, register TYPE b) \ +{ \ + for (; n > 0; --n) { \ + HUNDRED(contents) \ + } \ + return p + a + b; \ +} \ + \ +void \ +mhz_##M(int enough) \ +{ \ + TYPE __i = 1; \ + TYPE *__x=(TYPE *)&__x, **__p=(TYPE **)__x, **__q = NULL; \ + _mhz_##M(1, __p, 1, 1); \ + BENCH1(__q = _mhz_##M(__n, __p, __i, __i); __n = 1;, enough) \ + use_pointer((void*)__q); \ + save_n(100 * get_n()); /* # of expressions executed */ \ +} + +MHZ(1, p=(TYPE**)*p;) +MHZ(2, a^=a+a;) +MHZ(3, a^=a+a+a;) +MHZ(4, a>>=b;) +MHZ(5, a>>=a+a;) +MHZ(6, a^=a<<b;) +MHZ(7, a^=a+b;) +MHZ(8, a+=(a+b)&07;) +MHZ(9, a^=n;b^=a;a|=b;) + +typedef void (*loop_f)(int); +loop_f loops[] = { + mhz_1, + mhz_2, + mhz_3, + mhz_4, + mhz_5, + mhz_6, + mhz_7, + mhz_8, + mhz_9, +}; + + +#define NTESTS (sizeof(loops) / sizeof(loop_f)) +#define BIT_SET(A,bit) ((A) & 1 << (bit)) + + +/* + * This is used to filter out bad points (mostly ones that have had + * their inner loop optimized away). Bad points are those with values + * less than 1/20th of the median value and more than 20 times the + * median value. 
+ * + * filter_data returns the number of valid data points, and puts the + * valid points in the lower part of the values[] array. + */ +int +filter_data(double values[], int size) +{ + int i; + int tests; + double median; + double *d = (double *)malloc((size + 1) * sizeof(double)); + + for (i = 0; i < size; ++i) d[i] = values[i]; + qsort(d, size, sizeof(double), double_compare); + + median = d[size/2]; + if (size > 0 && size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; + + free(d); + + /* if the data point is inside the envelope of acceptable + * results, then keep it, otherwise discard it + */ + for (i = 0, tests = 0; i < size; ++i) + if (0.05 * median < values[i] && values[i] < 20.0 * median) { + if (i > tests) values[tests] = values[i]; + tests++; + } + + return tests; +} + +/* + * make sure that there are enough points with significantly + * different data values (greater than 5% difference) in the + * data subset. + */ +int +classes(double values[], int size) +{ + int i; + double median; + double *d = (double *)malloc(size * sizeof(double)); + int classid; + + for (i = 0; i < size; ++i) d[i] = values[i]; + qsort(d, size, sizeof(double), double_compare); + + median = d[size/2]; + if (size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; + + /* if the difference is less than 1/20th of the median, then + * we assume that the two points are the same + */ + for (i = 1, classid = 1; i < size; ++i) + if ((d[i] - d[i-1]) > 0.05 * median) classid++; + + free(d); + return classid; +} + +/* + * mode + * + * return the most common value (within 1MHz) + */ +int +mode(double values[], int n) +{ + int i, n_mode, n_curr; + int mode, curr; + + qsort(values, n, sizeof(double), double_compare); + + n_mode = 1; + n_curr = 1; + mode = (int)(values[0] + 0.5); + curr = (int)(values[0] + 0.5); + + for (i = 1; i < n; ++i) { + int v = (int)(values[i] + 0.5); + if (curr != v) { + curr = v; + n_curr = 0; + } + n_curr++; + if (n_curr > n_mode) { + mode = curr; + n_mode = n_curr; 
+ } + } + + return mode; +} + +/* + * cross_values + * + * This routine will create new data points by subtracting pairs + * of data points. + */ +void +cross_values(double values[], int size, double **cvalues, int *csize) +{ + int i, j; + + *cvalues = (double *)malloc(size * size * sizeof(double)); + *csize = 0; + + for (i = 0; i < size; ++i) { + (*cvalues)[(*csize)++] = values[i]; + /* create new points with the differences */ + for (j = i + 1; j < size; ++j) { + (*cvalues)[(*csize)++] = ABS(values[i] - values[j]); + } + } +} + + +/* + * gcd + * + * return the greatest common divisor of the passed values (within a + * margin of error because these are experimental results, not + * theoretical numbers). We do this by guessing how many instructions + * are in each loop, and then trying to fit a straight line through + * the (instruction count, time) points. The regression is of the + * form: + * + * y = a + b * x + * + * The time for an individual instruction is "b", while "a" should + * be 0. The trick is to figure out which guess is the right one! + * + * We assume that the gcd is the first value at which we have + * significantly improved regression fit (as measured by chi2). + * + * We increase the number of experimental points (and generate + * more small points) by adding points for the differences between + * measured values (and compute the standard error appropriately). + * + * We want the regression line to go through the origin, so we + * add an artificial point at (0,0) with a tiny standard error. 
+ */ +double +gcd(double values[], int size) +{ +/* assumption: shortest inner loop has no more than this many instructions */ +#define MAX_COUNT 6 + int i, n, count; + double min, result, min_chi2 = 0.0, a, b, sig_a, sig_b, chi2; + double *y, *x = (double *)malloc(size * size * sizeof(double)); + + /* find the smallest value */ + result = min = double_min(values, size); + + /* create new points by subtracting each pair of values */ + cross_values(values, size, &y, &n); + + /* make sure the regression goes through the origin */ + y[n++] = 0.0; + + for (count = 1; count < MAX_COUNT; ++count) { + /* + * given the minimum loop has "count" instructions, + * guess how many instructions each other loop contains + */ + for (i = 0; i < n; ++i) { + int m = (int)((double)count * y[i] / min + 0.5); + x[i] = (double)m; + } + + /* find the regression of the samples */ + regression(x, y, NULL, n, &a, &b, &sig_a, &sig_b, &chi2); + + if (count == 1 || count * count * chi2 < min_chi2) { + result = b; + min_chi2 = chi2; + } + } + free(x); + free(y); + return result; +} + +/* + * compute the gcd of many possible combinations of experimental values + * and return the mode of the results to reduce the impact + * of a few bad experimental measurements on the computed result. + * + * r - pointer to the array of experimental results + * off - offset of the result we want. TRIES-1 == minimum result. 
+ */ +int +compute_mhz(result_t *r) +{ + int i, j, mhz[2], n, subset, ntests; + double data[NTESTS], results[1<<NTESTS]; + + for (i = 0; i < 2; ++i) { + for (subset = 0, ntests = 0; subset < (1<<NTESTS); ++subset) { + for (j = 0, n = 0; j < NTESTS; ++j) + if (BIT_SET(subset, j) && r[j].N > TRIES/2) + data[n++] = r[j].v[r[j].N-1-i].u / (double)r[j].v[r[j].N-1-i].n; + if (n < 2 + || (n = filter_data(data, n)) < 2 + ||classes(data, n) < 2) + continue; + results[ntests++] = 1.0 / gcd(data, n); + } + mhz[i] = mode(results, ntests); + } + /* if the results agree within 1% or 1MHz, accept them */ + if (ABS(mhz[0] - mhz[1]) / (double)mhz[0] <= 0.01 + || ABS(mhz[0] - mhz[1]) <= 1) + return mhz[0]; + + return -1; +} + +void +save_data(result_t* data, result_t* data_save) +{ + int i; + + for (i = 0; i < NTESTS; ++i) { + data_save[i] = data[i]; + } +} + +void +print_data(double mhz, result_t* data) +{ + int i, j; + char *CPU_name = "CPU"; + char *uname = "uname"; + char *email = "email"; + int speed = -1; + char *names[NTESTS]; + + names[0] = name_1(); + names[1] = name_2(); + names[2] = name_3(); + names[3] = name_4(); + names[4] = name_5(); + names[5] = name_6(); + names[6] = name_7(); + names[7] = name_8(); + names[8] = name_9(); + + printf("/* \"%s\", \"%s\", \"%s\", %d, %.0f, %d, %f, %f */\n", + CPU_name, uname, email, speed, + mhz, get_enough(0), l_overhead(), t_overhead()); + printf("result_t* data[] = { \n"); + for (i = 0; i < NTESTS; ++i) { + printf("\t/* %s */ { %d, {", names[i], data[i].N); + for (j = 0; j < data[i].N; ++j) { + printf("\n\t\t{ /* %f */ %lu, %lu}", data[i].v[j].u / (100. 
* data[i].v[j].n), (unsigned long)data[i].v[j].u, (unsigned long)data[i].v[j].n); + if (j < TRIES - 1) printf(", "); + } + if (i < NTESTS - 1) printf("}},\n"); + else printf("}}\n"); + } + printf("};\n"); +} + +int +main(int ac, char **av) +{ + int c, i, j, k, mhz = -1; + double runtime; + result_t data[NTESTS]; + result_t data_save[NTESTS]; + char *usage = "[-d] [-c]\n"; + + putenv("LOOP_O=0.0"); /* should be at most 1% */ + + runtime = (NTESTS * TRIES * 3 * get_enough(0)) / 1000000.; + if (runtime > 3.) { + fprintf(stderr, "mhz: should take approximately %.0f seconds\n", runtime); + } + + /* make three efforts to get reliable data */ + for (i = 0; i < 3 && mhz < 0; ++i) { + /* initialize the data arrays */ + for (j = 0; j < NTESTS; ++j) + insertinit(&data[j]); + + /* + * collect the data; try to minimize impact of activity bursts + * by putting NTESTS in the inner loop so a burst will affect + * one data point for all expressions first, rather than all + * data points for one expression. + */ + for (j = 0; j < TRIES; ++j) { + for (k = 0; k < NTESTS; ++k) { + (*loops[k])(0); + insertsort(gettime(), get_n(), &data[k]); + } + } + save_data(data, data_save); + mhz = compute_mhz(data); + } + + while (( c = getopt(ac, av, "cd")) != EOF) { + switch(c) { + case 'c': + if (mhz > 0) { + printf("%.4f\n", 1000. / (double)mhz); + mhz = 0; + } + break; + case 'd': + print_data(mhz, data_save); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + if (mhz < 0) { + printf("-1 System too busy\n"); + exit(1); + } + + if (mhz > 0) { + printf("%d MHz, %.4f nanosec clock\n", + mhz, 1000. 
/ (double)mhz); + } + exit(0); +} diff --git a/performance/lmbench3/src/msleep.c b/performance/lmbench3/src/msleep.c new file mode 100644 index 0000000..e605d50 --- /dev/null +++ b/performance/lmbench3/src/msleep.c @@ -0,0 +1,21 @@ +#include "bench.h" + +int +main(int ac, char **av) +{ +#if defined(sgi) || defined(sun) || defined(linux) + usleep(atoi(av[1]) * 1000); + return (0); +#else + fd_set set; + int fd; + struct timeval tv; + + tv.tv_sec = 0; + tv.tv_usec = atoi(av[1]) * 1000; + FD_ZERO(&set); + FD_SET(0, &set); + select(1, &set, 0, 0, &tv); + return (0); +#endif +} diff --git a/performance/lmbench3/src/names.h b/performance/lmbench3/src/names.h new file mode 100644 index 0000000..ea7775c --- /dev/null +++ b/performance/lmbench3/src/names.h @@ -0,0 +1,102 @@ +char *names[] = { +"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", +"k", "l", "m", "n", "o", "p", "q", "r", "s", "t", +"u", "v", "w", "x", "y", "z", "aa", "ab", "ac", "ad", +"ae", "af", "ag", "ah", "ai", "aj", "ak", "al", "am", "an", +"ao", "ap", "aq", "ar", "as", "at", "au", "av", "aw", "ax", +"ay", "az", "ba", "bb", "bc", "bd", "be", "bf", "bg", "bh", +"bi", "bj", "bk", "bl", "bm", "bn", "bo", "bp", "bq", "br", +"bs", "bt", "bu", "bv", "bw", "bx", "by", "bz", "ca", "cb", +"cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "ck", "cl", +"cm", "cn", "co", "cp", "cq", "cr", "cs", "ct", "cu", "cv", +"cw", "cx", "cy", "cz", "da", "db", "dc", "dd", "de", "df", +"dg", "dh", "di", "dj", "dk", "dl", "dm", "dn", "do", "dp", +"dq", "dr", "ds", "dt", "du", "dv", "dw", "dx", "dy", "dz", +"ea", "eb", "ec", "ed", "ee", "ef", "eg", "eh", "ei", "ej", +"ek", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", +"eu", "ev", "ew", "ex", "ey", "ez", "fa", "fb", "fc", "fd", +"fe", "ff", "fg", "fh", "fi", "fj", "fk", "fl", "fm", "fn", +"fo", "fp", "fq", "fr", "fs", "ft", "fu", "fv", "fw", "fx", +"fy", "fz", "ga", "gb", "gc", "gd", "ge", "gf", "gg", "gh", +"gi", "gj", "gk", "gl", "gm", "gn", "go", "gp", "gq", "gr", +"gs", 
"gt", "gu", "gv", "gw", "gx", "gy", "gz", "ha", "hb", +"hc", "hd", "he", "hf", "hg", "hh", "hi", "hj", "hk", "hl", +"hm", "hn", "ho", "hp", "hq", "hr", "hs", "ht", "hu", "hv", +"hw", "hx", "hy", "hz", "ia", "ib", "ic", "id", "ie", "if", +"ig", "ih", "ii", "ij", "ik", "il", "im", "in", "io", "ip", +"iq", "ir", "is", "it", "iu", "iv", "iw", "ix", "iy", "iz", +"ja", "jb", "jc", "jd", "je", "jf", "jg", "jh", "ji", "jj", +"jk", "jl", "jm", "jn", "jo", "jp", "jq", "jr", "js", "jt", +"ju", "jv", "jw", "jx", "jy", "jz", "ka", "kb", "kc", "kd", +"ke", "kf", "kg", "kh", "ki", "kj", "kk", "kl", "km", "kn", +"ko", "kp", "kq", "kr", "ks", "kt", "ku", "kv", "kw", "kx", +"ky", "kz", "la", "lb", "lc", "ld", "le", "lf", "lg", "lh", +"li", "lj", "lk", "ll", "lm", "ln", "lo", "lp", "lq", "lr", +"ls", "lt", "lu", "lv", "lw", "lx", "ly", "lz", "ma", "mb", +"mc", "md", "me", "mf", "mg", "mh", "mi", "mj", "mk", "ml", +"mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", +"mw", "mx", "my", "mz", "na", "nb", "nc", "nd", "ne", "nf", +"ng", "nh", "ni", "nj", "nk", "nl", "nm", "nn", "no", "np", +"nq", "nr", "ns", "nt", "nu", "nv", "nw", "nx", "ny", "nz", +"oa", "ob", "oc", "od", "oe", "of", "og", "oh", "oi", "oj", +"ok", "ol", "om", "on", "oo", "op", "oq", "or", "os", "ot", +"ou", "ov", "ow", "ox", "oy", "oz", "pa", "pb", "pc", "pd", +"pe", "pf", "pg", "ph", "pi", "pj", "pk", "pl", "pm", "pn", +"po", "pp", "pq", "pr", "ps", "pt", "pu", "pv", "pw", "px", +"py", "pz", "qa", "qb", "qc", "qd", "qe", "qf", "qg", "qh", +"qi", "qj", "qk", "ql", "qm", "qn", "qo", "qp", "qq", "qr", +"qs", "qt", "qu", "qv", "qw", "qx", "qy", "qz", "ra", "rb", +"rc", "rd", "re", "rf", "rg", "rh", "ri", "rj", "rk", "rl", +"rm", "rn", "ro", "rp", "rq", "rr", "rs", "rt", "ru", "rv", +"rw", "rx", "ry", "rz", "sa", "sb", "sc", "sd", "se", "sf", +"sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sp", +"sq", "sr", "ss", "st", "su", "sv", "sw", "sx", "sy", "sz", +"ta", "tb", "tc", "td", "te", "tf", "tg", "th", "ti", 
"tj", +"tk", "tl", "tm", "tn", "to", "tp", "tq", "tr", "ts", "tt", +"tu", "tv", "tw", "tx", "ty", "tz", "ua", "ub", "uc", "ud", +"ue", "uf", "ug", "uh", "ui", "uj", "uk", "ul", "um", "un", +"uo", "up", "uq", "ur", "us", "ut", "uu", "uv", "uw", "ux", +"uy", "uz", "va", "vb", "vc", "vd", "ve", "vf", "vg", "vh", +"vi", "vj", "vk", "vl", "vm", "vn", "vo", "vp", "vq", "vr", +"vs", "vt", "vu", "vv", "vw", "vx", "vy", "vz", "wa", "wb", +"wc", "wd", "we", "wf", "wg", "wh", "wi", "wj", "wk", "wl", +"wm", "wn", "wo", "wp", "wq", "wr", "ws", "wt", "wu", "wv", +"ww", "wx", "wy", "wz", "xa", "xb", "xc", "xd", "xe", "xf", +"xg", "xh", "xi", "xj", "xk", "xl", "xm", "xn", "xo", "xp", +"xq", "xr", "xs", "xt", "xu", "xv", "xw", "xx", "xy", "xz", +"ya", "yb", "yc", "yd", "ye", "yf", "yg", "yh", "yi", "yj", +"yk", "yl", "ym", "yn", "yo", "yp", "yq", "yr", "ys", "yt", +"yu", "yv", "yw", "yx", "yy", "yz", "za", "zb", "zc", "zd", +"ze", "zf", "zg", "zh", "zi", "zj", "zk", "zl", "zm", "zn", +"zo", "zp", "zq", "zr", "zs", "zt", "zu", "zv", "zw", "zx", +"zy", "zz", "aaa", "aab", "aac", "aad", "aae", "aaf", "aag", "aah", +"aai", "aaj", "aak", "aal", "aam", "aan", "aao", "aap", "aaq", "aar", +"aas", "aat", "aau", "aav", "aaw", "aax", "aay", "aaz", "aba", "abb", +"abc", "abd", "abe", "abf", "abg", "abh", "abi", "abj", "abk", "abl", +"abm", "abn", "abo", "abp", "abq", "abr", "abs", "abt", "abu", "abv", +"abw", "abx", "aby", "abz", "aca", "acb", "acc", "acd", "ace", "acf", +"acg", "ach", "aci", "acj", "ack", "acl", "acm", "acn", "aco", "acp", +"acq", "acr", "acs", "act", "acu", "acv", "acw", "acx", "acy", "acz", +"ada", "adb", "adc", "add", "ade", "adf", "adg", "adh", "adi", "adj", +"adk", "adl", "adm", "adn", "ado", "adp", "adq", "adr", "ads", "adt", +"adu", "adv", "adw", "adx", "ady", "adz", "aea", "aeb", "aec", "aed", +"aee", "aef", "aeg", "aeh", "aei", "aej", "aek", "ael", "aem", "aen", +"aeo", "aep", "aeq", "aer", "aes", "aet", "aeu", "aev", "aew", "aex", +"aey", "aez", "afa", "afb", "afc", 
"afd", "afe", "aff", "afg", "afh", +"afi", "afj", "afk", "afl", "afm", "afn", "afo", "afp", "afq", "afr", +"afs", "aft", "afu", "afv", "afw", "afx", "afy", "afz", "aga", "agb", +"agc", "agd", "age", "agf", "agg", "agh", "agi", "agj", "agk", "agl", +"agm", "agn", "ago", "agp", "agq", "agr", "ags", "agt", "agu", "agv", +"agw", "agx", "agy", "agz", "aha", "ahb", "ahc", "ahd", "ahe", "ahf", +"ahg", "ahh", "ahi", "ahj", "ahk", "ahl", "ahm", "ahn", "aho", "ahp", +"ahq", "ahr", "ahs", "aht", "ahu", "ahv", "ahw", "ahx", "ahy", "ahz", +"aia", "aib", "aic", "aid", "aie", "aif", "aig", "aih", "aii", "aij", +"aik", "ail", "aim", "ain", "aio", "aip", "aiq", "air", "ais", "ait", +"aiu", "aiv", "aiw", "aix", "aiy", "aiz", "aja", "ajb", "ajc", "ajd", +"aje", "ajf", "ajg", "ajh", "aji", "ajj", "ajk", "ajl", "ajm", "ajn", +"ajo", "ajp", "ajq", "ajr", "ajs", "ajt", "aju", "ajv", "ajw", "ajx", +"ajy", "ajz", "aka", "akb", "akc", "akd", "ake", "akf", "akg", "akh", +"aki", "akj", "akk", "akl", "akm", "akn", "ako", "akp", "akq", "akr", +"aks", "akt", "aku", "akv", "akw", "akx", "aky", "akz", "ala", "alb", +"alc", "ald", "ale", "alf", "alg", "alh", "ali", "alj", "alk", "all", +}; diff --git a/performance/lmbench3/src/par_mem.c b/performance/lmbench3/src/par_mem.c new file mode 100644 index 0000000..2bb78e6 --- /dev/null +++ b/performance/lmbench3/src/par_mem.c @@ -0,0 +1,81 @@ +/* + * par_mem.c - determine the memory hierarchy parallelism + * + * usage: par_mem [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +void compute_times(struct mem_state* state, double* tlb_time, double* cache_time); + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i; + int c; + int warmup = 0; + int repetitions = TRIES; + int print_cost = 0; + size_t len; + size_t maxlen = 64 * 1024 * 1024; + double par; + struct mem_state state; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + + state.line = getpagesize() / 16; + state.pagesize = getpagesize(); + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + state.line = atoi(optarg); + if (state.line < sizeof(char*)) + state.line = sizeof(char*); + break; + case 'M': + maxlen = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + for (i = MAX_MEM_PARALLELISM * state.line; i <= maxlen; i<<=1) { + par = par_mem(i, warmup, repetitions, &state); + + if (par > 0.) { + fprintf(stderr, "%.6f %.2f\n", + i / (1000. * 1000.), par); + } + } + + exit(0); +} + + diff --git a/performance/lmbench3/src/par_ops.c b/performance/lmbench3/src/par_ops.c new file mode 100644 index 0000000..1b79615 --- /dev/null +++ b/performance/lmbench3/src/par_ops.c @@ -0,0 +1,501 @@ +/* + * par_ops.c - benchmark of simple operation parallelism + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + int N; + int M; + int K; + int* int_data; + double* double_data; +}; + +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +#define FIVE(m) m m m m m +#define TEN(m) FIVE(m) FIVE(m) +#define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) +#define HUNDRED(m) FIFTY(m) FIFTY(m) + +#define MAX_LOAD_PARALLELISM 16 + +double +max_parallelism(benchmp_f* benchmarks, + int warmup, int repetitions, void* cookie) +{ + int i, j, k; + double baseline, max_load_parallelism, load_parallelism; + result_t *results, *r_save; + + max_load_parallelism = 1.; + + for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { + benchmp(initialize, benchmarks[i], cleanup, + 0, 1, warmup, repetitions, cookie); + save_minimum(); + + if (gettime() == 0) + return -1.; + + if (i == 0) { + baseline = (double)gettime() / (double)get_n(); + } else { + load_parallelism = baseline; + load_parallelism /= (double)gettime(); + load_parallelism *= (double)((i + 1) * get_n()); + if (load_parallelism > max_load_parallelism) { + max_load_parallelism = load_parallelism; + } + } + } + return max_load_parallelism; +} + +#define REPEAT_0(m) m(0) +#define REPEAT_1(m) REPEAT_0(m) m(1) +#define REPEAT_2(m) REPEAT_1(m) m(2) +#define REPEAT_3(m) REPEAT_2(m) m(3) +#define REPEAT_4(m) REPEAT_3(m) m(4) +#define REPEAT_5(m) REPEAT_4(m) m(5) +#define REPEAT_6(m) REPEAT_5(m) m(6) +#define REPEAT_7(m) REPEAT_6(m) m(7) +#define REPEAT_8(m) REPEAT_7(m) m(8) +#define REPEAT_9(m) REPEAT_8(m) m(9) +#define REPEAT_10(m) REPEAT_9(m) m(10) +#define REPEAT_11(m) REPEAT_10(m) m(11) +#define REPEAT_12(m) REPEAT_11(m) m(12) +#define REPEAT_13(m) REPEAT_12(m) m(13) +#define REPEAT_14(m) REPEAT_13(m) m(14) +#define REPEAT_15(m) REPEAT_14(m) m(15) + +#define BENCHMARK(benchmark,N,repeat) \ +void benchmark##_##N(iter_t iterations, void *cookie) \ +{ \ + register iter_t i = iterations; \ + struct _state* state = (struct _state*)cookie; \ + 
repeat(DECLARE); \ + \ + repeat(INIT); \ + while (i-- > 0) { \ + repeat(PREAMBLE); \ + TEN(repeat(BODY)); \ + } \ + \ + repeat(SAVE); \ +} + +#define PARALLEL_BENCHMARKS(benchmark) \ + BENCHMARK(benchmark, 0, REPEAT_0) \ + BENCHMARK(benchmark, 1, REPEAT_1) \ + BENCHMARK(benchmark, 2, REPEAT_2) \ + BENCHMARK(benchmark, 3, REPEAT_3) \ + BENCHMARK(benchmark, 4, REPEAT_4) \ + BENCHMARK(benchmark, 5, REPEAT_5) \ + BENCHMARK(benchmark, 6, REPEAT_6) \ + BENCHMARK(benchmark, 7, REPEAT_7) \ + BENCHMARK(benchmark, 8, REPEAT_8) \ + BENCHMARK(benchmark, 9, REPEAT_9) \ + BENCHMARK(benchmark, 10, REPEAT_10) \ + BENCHMARK(benchmark, 11, REPEAT_11) \ + BENCHMARK(benchmark, 12, REPEAT_12) \ + BENCHMARK(benchmark, 13, REPEAT_13) \ + BENCHMARK(benchmark, 14, REPEAT_14) \ + BENCHMARK(benchmark, 15, REPEAT_15) \ + \ + benchmp_f benchmark##_benchmarks[] = { \ + benchmark##_0, \ + benchmark##_1, \ + benchmark##_2, \ + benchmark##_3, \ + benchmark##_4, \ + benchmark##_5, \ + benchmark##_6, \ + benchmark##_7, \ + benchmark##_8, \ + benchmark##_9, \ + benchmark##_10, \ + benchmark##_11, \ + benchmark##_12, \ + benchmark##_13, \ + benchmark##_14, \ + benchmark##_15 \ + }; + +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N ^= s##N; s##N ^= r##N; r##N |= s##N; +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] + 1; s##N = (N+1) + r##N; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_bit) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) a##N += b##N; b##N -= a##N; +#define DECLARE(N) register int a##N, b##N; +#define INIT(N) a##N = state->int_data[N] + 57; \ + a##N = state->int_data[N] + 31; +#define PREAMBLE(N) +#define SAVE(N) use_int(a##N + b##N); +PARALLEL_BENCHMARKS(integer_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= s##N; +#define DECLARE(N) register int r##N, s##N, 
t##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 37431; \ + s##N = state->int_data[N] - N + 1 + 4; \ + t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ + s##N * s##N * s##N * s##N * s##N - r##N; \ + r##N += t##N; +#define PREAMBLE(N) r##N -= t##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N / r##N); +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 36; \ + s##N = (r##N + 1) << 20; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N %= s##N; r##N |= s##N; +#define DECLARE(N) register int r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + iterations; \ + s##N = state->int_data[N] - N + 1 + 62; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(integer_mod) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N ^= i##N; s##N ^= r##N; r##N |= s##N; +#define DECLARE(N) register int64 r##N, s##N, i##N; +#define INIT(N) r##N = state->int_data[N] - N + 1; \ + r##N |= r##N << 32; \ + s##N = iterations + state->int_data[N] - N + 1; \ + s##N |= s##N << 32; \ + i##N = (s##N << 2) - (int64)1; +#define PREAMBLE(N) i##N -= 1; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_bit) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) a##N += b##N; b##N -= a##N; +#define DECLARE(N) register int64 a##N, b##N; +#define INIT(N) a##N = state->int_data[N] - N + 1 + 37420; \ + a##N += (int64)(0xFE + state->int_data[N] - N + 1)<<30; \ + b##N = state->int_data[N] - N + 1 + 21698324; \ + b##N += (int64)(0xFFFE + state->int_data[N] - N + 1)<<29; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)a##N + (int)b##N); 
+PARALLEL_BENCHMARKS(int64_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (r##N * s##N); +#define DECLARE(N) register int64 r##N, s##N, t##N; +#define INIT(N) r##N = state->int_data[N] - N + 1 + 37420; \ + r##N += (int64)(state->int_data[N] - N + 1 + 6)<<32; \ + s##N = state->int_data[N] - N + 1 + 4; \ + t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ + s##N * s##N * s##N * s##N * s##N - r##N; \ + r##N += t##N; +#define PREAMBLE(N) r##N -= t##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N / r##N); +#define DECLARE(N) register int64 r##N, s##N; +#define INIT(N) r##N = state->int_data[N] - N + 37; \ + r##N += r##N << 33; \ + s##N = (r##N + 17) << 13; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = (s##N % r##N) ^ r##N; +#define DECLARE(N) register int64 r##N, s##N; +#define INIT(N) r##N = (int64)state->int_data[N]; s##N = 0; +#define PREAMBLE(N) s##N++; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(int64_mod) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N += r##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = (float)state->double_data[N] + 1023.0; \ + s##N = (float)state->K; +#define PREAMBLE(N) r##N += s##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(float_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= r##N; r##N *= s##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = 8.0f * (float)state->double_data[N]; \ + s##N = 0.125 * (float)state->M * state->double_data[N] / 1000.0; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); 
+PARALLEL_BENCHMARKS(float_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = s##N / r##N; +#define DECLARE(N) register float r##N, s##N; +#define INIT(N) r##N = 1.41421356f * (float)state->double_data[N]; \ + s##N = 3.14159265f * (float)(state->int_data[N] - N + 1); +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(float_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N += r##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = state->double_data[N] + 1023.; \ + s##N = (double)state->K; +#define PREAMBLE(N) r##N += s##N; +#define SAVE(N) use_int((int)r##N); +PARALLEL_BENCHMARKS(double_add) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N *= r##N; r##N *= s##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = 8.0f * state->double_data[N]; \ + s##N = 0.125 * (double)state->M * state->double_data[N] / 1000.0; +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(double_mul) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + +#define BODY(N) r##N = s##N / r##N; +#define DECLARE(N) register double r##N, s##N; +#define INIT(N) r##N = 1.41421356 * state->double_data[N]; \ + s##N = 3.14159265 * (double)(state->int_data[N] - N + 1); +#define PREAMBLE(N) +#define SAVE(N) use_int((int)r##N); use_int((int)s##N); +PARALLEL_BENCHMARKS(double_div) +#undef BODY +#undef DECLARE +#undef INIT +#undef PREAMBLE +#undef SAVE + + +void +initialize(iter_t iterations, void* cookie) +{ + struct _state *state = (struct _state*)cookie; + register int i; + + if (iterations) return; + + state->int_data = (int*)malloc(MAX_LOAD_PARALLELISM * sizeof(int)); + state->double_data = (double*)malloc(MAX_LOAD_PARALLELISM * sizeof(double)); + + for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { + 
state->int_data[i] = i+1; + state->double_data[i] = 1.; + } +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state *state = (struct _state*)cookie; + + if (iterations) return; + + free(state->int_data); + free(state->double_data); +} + + +int +main(int ac, char **av) +{ + int c; + int warmup = 0; + int repetitions = TRIES; + double par; + struct _state state; + char *usage = "[-W <warmup>] [-N <repetitions>]\n"; + + state.N = 1; + state.M = 1000; + state.K = -1023; + + while (( c = getopt(ac, av, "W:N:")) != EOF) { + switch(c) { + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + par = max_parallelism(integer_bit_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer bit parallelism: %.2f\n", par); + + par = max_parallelism(integer_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer add parallelism: %.2f\n", par); + + par = max_parallelism(integer_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer mul parallelism: %.2f\n", par); + + par = max_parallelism(integer_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer div parallelism: %.2f\n", par); + + par = max_parallelism(integer_mod_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "integer mod parallelism: %.2f\n", par); + + par = max_parallelism(int64_bit_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 bit parallelism: %.2f\n", par); + + par = max_parallelism(int64_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 add parallelism: %.2f\n", par); + + par = max_parallelism(int64_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) 
+ fprintf(stderr, "int64 mul parallelism: %.2f\n", par); + + par = max_parallelism(int64_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 div parallelism: %.2f\n", par); + + par = max_parallelism(int64_mod_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "int64 mod parallelism: %.2f\n", par); + + par = max_parallelism(float_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float add parallelism: %.2f\n", par); + + par = max_parallelism(float_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float mul parallelism: %.2f\n", par); + + par = max_parallelism(float_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "float div parallelism: %.2f\n", par); + + par = max_parallelism(double_add_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double add parallelism: %.2f\n", par); + + par = max_parallelism(double_mul_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double mul parallelism: %.2f\n", par); + + par = max_parallelism(double_div_benchmarks, + warmup, repetitions, &state); + if (par > 0.) + fprintf(stderr, "double div parallelism: %.2f\n", par); + + + return(0); +} + diff --git a/performance/lmbench3/src/rhttp.c b/performance/lmbench3/src/rhttp.c new file mode 100644 index 0000000..0213050 --- /dev/null +++ b/performance/lmbench3/src/rhttp.c @@ -0,0 +1,125 @@ +/* + * rhttp.c - simple HTTP transaction latency test + * + * usage: rhttp hostname [port] remote-clients -p file file + * + * This turns into a bunch of + * rsh remote http hostname file file file [port] + * with the results aggragated and reported. + * + * The program "http" must be in your path on the remote machine. 
+ * + * XXX - the way this should work is like so: + * parent process reading file names from stdin + * multiple child processes connected to the parent process + * while more file names + * wait for a child process to be idle + * feed it ~10 filenames + * the child processes need to be able to tell the parent that they + * want more work. They also need to pass back the results. + * + * Copyright (c) 1994-1997 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Silicon Graphics is gratefully acknowledged. + */ +char *id = "$Id$\n"; + +#include "bench.h" + +int +main(int ac, char **av) +{ + char *name = av[0], *server, *prog; + int i, j; + uint64 total = 0; + uint64 usecs = 0; + char *args[1024]; + + if (ac < 5) { +usage: fprintf(stderr, + "Usage: %s hostname [port] remote-clients -p file ...\n", + name); + exit(1); + } + server = av[1]; + av++, ac--; /* eat server */ + if (atoi(av[1]) != 0) { + prog = av[1]; + av++, ac--; /* eat port */ + } else { + prog = "80"; /* http */ + } + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + i++; + break; + } + } + args[0] = "rsh"; + args[2] = "http"; + args[3] = server; + j = 4; + while (i < ac) { + args[j++] = av[i++]; + } + args[j++] = prog; + args[j] = 0; + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + break; + } + args[1] = av[i]; + for (j = 0; args[j]; j++) { + printf("%s ", args[j]); + } + printf("\n"); + if (fork() == 0) { + char name[30]; + + sprintf(name, "/tmp/rhttp%d", i); + creat(name, 0666); + close(2); + dup(1); + execvp(args[0], args); + perror(args[0]); + exit(1); + } + } + for (i = 1; i < ac; ++i) { + if (!strcmp("-p", av[i])) { + break; + } + wait(0); + } + system("cat /tmp/rhttp*; rm /tmp/rhttp*"); + exit(1); + for (i = 1; i < ac; ++i) { + int fd, n, m = 0; + float f1 = 0, f2 = 0; + char 
buf[30]; + + if (!strcmp("-p", av[i])) { + break; + } + sprintf(buf, "/tmp/http%d", i); + fd = open(buf, 0); + unlink(buf); + /* + * Avg xfer: 3.9KB, 235.0KB in 2038 millisecs, 115.31 KB/sec + */ + n = read(fd, buf, XFERSIZE); + buf[n] = 0; + sscanf(buf, "Avg xfer: %fKB, %fKB in %d millisecs,", + &f1, &f2, &m); + if (m > usecs) { + usecs = m; + } + total += f2; + } + total <<= 10; + usecs *= 1000; + settime(usecs); + latency((uint64)1, total); +} diff --git a/performance/lmbench3/src/seek.c b/performance/lmbench3/src/seek.c new file mode 100644 index 0000000..b78b2a8 --- /dev/null +++ b/performance/lmbench3/src/seek.c @@ -0,0 +1,65 @@ +char *id = "$Id$\n"; +/* + * Seek - calculate seeks as a function of distance. + * + * Usage: seek file size + * + * Copyright (c) 1994,1995,1996 Larry McVoy. All rights reserved. + */ + +#include "bench.h" + +#define STRIDE 1024*1024 + +main(ac, av) + int ac; + char *av[]; +{ + char buf[512]; + int disk; + off_t size; + off_t begin, end; + int usecs; + + if (ac != 3) { + exit(1); + } + if ((disk = open(av[1], 0)) == -1) { + exit(1); + } + size = atol(av[2]); + switch (av[2][strlen(av[2])-1]) { + case 'k': size <<= 10; break; + case 'K': size *= 1000; break; + case 'm': size <<= 20; break; + case 'M': size *= 1000000; break; + case 'g': size <<= 30; break; + case 'G': size *= 1000000000L; break; + } + + /* + * We flip back and forth, in strides of 1MB. + * If we have a 100MB disk, that means we do + * 1, 99, 2, 98, etc. 
+ */ + end = size; + begin = 0; + lseek(disk, begin, 0); + read(disk, buf, sizeof(buf)); + while (end > begin) { + end -= STRIDE; + start(); + lseek(disk, end, 0); + read(disk, buf, sizeof(buf)); + usecs = stop(); + printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); + + begin += STRIDE; + start(); + lseek(disk, begin, 0); + read(disk, buf, sizeof(buf)); + usecs = stop(); + printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); + } + exit(0); +} diff --git a/performance/lmbench3/src/stats.h b/performance/lmbench3/src/stats.h new file mode 100644 index 0000000..c355168 --- /dev/null +++ b/performance/lmbench3/src/stats.h @@ -0,0 +1,61 @@ +#ifndef _STATS_H +#define _STATS_H + +#include "bench.h" +#include "timing.h" + +#define ABS(x) ((x) < 0 ? -(x) : (x)) + +int int_compare(const void *a, const void *b); +int uint64_compare(const void *a, const void *b); +int double_compare(const void *a, const void *b); + +typedef int (*int_stat)(int *values, int size); +typedef uint64 (*uint64_stat)(uint64 *values, int size); +typedef double (*double_stat)(double *values, int size); + +int int_median(int *values, int size); +uint64 uint64_median(uint64 *values, int size); +double double_median(double *values, int size); + +int int_mean(int *values, int size); +uint64 uint64_mean(uint64 *values, int size); +double double_mean(double *values, int size); + +int int_min(int *values, int size); +uint64 uint64_min(uint64 *values, int size); +double double_min(double *values, int size); + +int int_max(int *values, int size); +uint64 uint64_max(uint64 *values, int size); +double double_max(double *values, int size); + +double int_variance(int *values, int size); +double uint64_variance(uint64 *values, int size); +double double_variance(double *values, int size); + +double int_moment(int moment, int *values, int size); +double uint64_moment(int moment, uint64 *values, int size); +double double_moment(int moment, double *values, int size); + +double int_stderr(int 
*values, int size); +double uint64_stderr(uint64 *values, int size); +double double_stderr(double *values, int size); + +double int_skew(int *values, int size); +double uint64_skew(uint64 *values, int size); +double double_skew(double *values, int size); + +double int_kurtosis(int *values, int size); +double uint64_kurtosis(uint64 *values, int size); +double double_kurtosis(double *values, int size); + +double int_bootstrap_stderr(int *values, int size, int_stat f); +double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f); +double double_bootstrap_stderr(double *values, int size, double_stat f); + +void regression(double *x, double *y, double *sig, int n, + double *a, double *b, double *sig_a, double *sig_b, + double *chi2); + +#endif /* _STATS_H */ diff --git a/performance/lmbench3/src/stream.c b/performance/lmbench3/src/stream.c new file mode 100644 index 0000000..1202f32 --- /dev/null +++ b/performance/lmbench3/src/stream.c @@ -0,0 +1,309 @@ +/* + * steam.c - lmbench version of John McCalpin's STREAM benchmark + * + * usage: stream + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +struct _state { + double* a; + double* b; + double* c; + double scalar; + int len; +}; + +void initialize(iter_t iterations, void* cookie); +void cleanup(iter_t iterations, void* cookie); + +/* These are from STREAM version 1 */ +void copy(iter_t iterations, void* cookie); +void scale(iter_t iterations, void* cookie); +void add(iter_t iterations, void* cookie); +void triad(iter_t iterations, void* cookie); + +/* These are from STREAM version 2 */ +void fill(iter_t iterations, void* cookie); +/* NOTE: copy is the same as in version 1 */ +void daxpy(iter_t iterations, void* cookie); +void sum(iter_t iterations, void* cookie); + + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines are no larger than 1/4 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, j, l; + int version = 1; + int parallel = 1; + int warmup = 0; + int repetitions = TRIES; + int c; + uint64 datasize; + struct _state state; + char *p; + char *usage = "[-v <stream version 1|2>] [-M <len>[K|M]] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; + + state.len = 1000 * 1000 * 3 * sizeof(double); + state.scalar = 3.0; + + while (( c = getopt(ac, av, "v:M:P:W:N:")) != EOF) { + switch(c) { + case 'v': + version = atoi(optarg); + if (version != 1 && version != 2) + lmbench_usage(ac, av, usage); + break; + case 'P': + parallel = atoi(optarg); + if (parallel <= 0) lmbench_usage(ac, av, usage); + break; + case 'M': + state.len = bytes(optarg); + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* ensure that we can malloc the desired space */ + while (!(p = malloc(state.len))) + state.len /= 2; + free(p); + + /* convert from bytes to array length */ + state.len /= 3 * sizeof(double); + datasize = sizeof(double) * state.len * parallel; 
+ + if (version == 1) { + benchmp(initialize, copy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM copy latency", state.len * get_n()); + fprintf(stderr, "STREAM copy bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, scale, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM scale latency", state.len * get_n()); + fprintf(stderr, "STREAM scale bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, sum, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM add latency", state.len * get_n()); + fprintf(stderr, "STREAM add bandwidth: "); + mb(3 * datasize * get_n()); + } + + benchmp(initialize, triad, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM triad latency", state.len * get_n()); + fprintf(stderr, "STREAM triad bandwidth: "); + mb(3 * datasize * get_n()); + } + } else { + benchmp(initialize, fill, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 fill latency", state.len * get_n()); + fprintf(stderr, "STREAM2 fill bandwidth: "); + mb(datasize * get_n()); + } + + benchmp(initialize, copy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 copy latency", state.len * get_n()); + fprintf(stderr, "STREAM2 copy bandwidth: "); + mb(2 * datasize * get_n()); + } + + benchmp(initialize, daxpy, cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 daxpy latency", state.len * get_n()); + fprintf(stderr, "STREAM2 daxpy bandwidth: "); + mb(3 * datasize * get_n()); + } + + benchmp(initialize, sum, 
cleanup, + 0, parallel, warmup, repetitions, &state); + if (gettime() > 0) { + if (parallel <= 1) save_minimum(); + nano("STREAM2 sum latency", state.len * get_n()); + fprintf(stderr, "STREAM2 sum bandwidth: "); + mb(datasize * get_n()); + } + } + + return(0); +} + +void +initialize(iter_t iterations, void* cookie) +{ + int i; + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + state->a = (double*)malloc(sizeof(double) * state->len); + state->b = (double*)malloc(sizeof(double) * state->len); + state->c = (double*)malloc(sizeof(double) * state->len); + + if (state->a == NULL || state->b == NULL || state->c == NULL) { + exit(1); + } + + for (i = 0; i < state->len; ++i) { + state->a[i] = 1.; + state->b[i] = 2.; + state->c[i] = 0.; + } +} + +#define BODY(expr) \ +{ \ + register int i; \ + register int N = state->len; \ + register double* a = state->a; \ + register double* b = state->b; \ + register double* c = state->c; \ + register double scalar = state->scalar; \ + \ + state->a = state->b; \ + state->b = state->c; \ + state->c = a; \ + \ + for (i = 0; i < N; ++i) { \ + expr; \ + } \ +} + +void +copy(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(c[i] = a[i];) + } +} + +void +scale(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(b[i] = scalar * c[i];) + } +} + +void +add(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(c[i] = a[i] + b[i];) + } +} + +void +triad(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(a[i] = b[i] + scalar * c[i];) + } +} + +/* + * STREAM version 2 benchmark kernels + * + * NOTE: copy is the same as version 1's benchmark + */ +void +fill(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + 
+ while (iterations-- > 0) { + BODY(a[i] = 0;) + } +} + +void +daxpy(iter_t iterations, void *cookie) +{ + struct _state* state = (struct _state*)cookie; + + while (iterations-- > 0) { + BODY(a[i] = a[i] + scalar * b[i];) + } +} + +void +sum(iter_t iterations, void *cookie) +{ + register double s; + struct _state* state = (struct _state*)cookie; + + s = 0.0; + while (iterations-- > 0) { + BODY(s += a[i];) + } + use_int((int)s); +} + +void +cleanup(iter_t iterations, void* cookie) +{ + struct _state* state = (struct _state*)cookie; + + if (iterations) return; + + free(state->a); + free(state->b); + free(state->c); +} + + + diff --git a/performance/lmbench3/src/timing.h b/performance/lmbench3/src/timing.h new file mode 100644 index 0000000..8757743 --- /dev/null +++ b/performance/lmbench3/src/timing.h @@ -0,0 +1,52 @@ +/* + * $Id$ + */ +#ifndef _TIMING_H +#define _TIMING_H + +char *p64(uint64 big); +char *p64sz(uint64 big); +double Delta(void); +double Now(void); +void adjust(int usec); +void bandwidth(uint64 bytes, uint64 times, int verbose); +uint64 bytes(char *s); +void context(uint64 xfers); +uint64 delta(void); +int get_enough(int); +uint64 get_n(void); +void kb(uint64 bytes); +double l_overhead(void); +char last(char *s); +void latency(uint64 xfers, uint64 size); +void mb(uint64 bytes); +void micro(char *s, uint64 n); +void micromb(uint64 mb, uint64 n); +void milli(char *s, uint64 n); +void morefds(void); +void nano(char *s, uint64 n); +uint64 now(void); +void ptime(uint64 n); +void rusage(void); +void save_n(uint64); +void settime(uint64 usecs); +void start(struct timeval *tv); +uint64 stop(struct timeval *begin, struct timeval *end); +uint64 t_overhead(void); +double timespent(void); +void timing(FILE *out); +uint64 tvdelta(struct timeval *, struct timeval *); +void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0); +void use_int(int result); +void use_pointer(void *result); +uint64 usecs_spent(void); +void touch(char *buf, int size); 
+size_t* permutation(int max, int scale); +int cp(char* src, char* dst, mode_t mode); +long bread(void* src, long count); + +#if defined(hpux) || defined(__hpux) +int getpagesize(); +#endif + +#endif /* _TIMING_H */ diff --git a/performance/lmbench3/src/timing_o.c b/performance/lmbench3/src/timing_o.c new file mode 100644 index 0000000..7b9a42a --- /dev/null +++ b/performance/lmbench3/src/timing_o.c @@ -0,0 +1,10 @@ +#include <stdio.h> +#include "bench.h" + +int +main() +{ + putenv("LOOP_O=0.0"); + printf("%lu\n", (unsigned long)t_overhead()); + return (0); +} diff --git a/performance/lmbench3/src/tlb.c b/performance/lmbench3/src/tlb.c new file mode 100644 index 0000000..5ad13c9 --- /dev/null +++ b/performance/lmbench3/src/tlb.c @@ -0,0 +1,178 @@ +/* + * tlb.c - guess the cache line size + * + * usage: tlb [-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] + * + * Copyright (c) 2000 Carl Staelin. + * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with + * additional restriction that results may published only if + * (1) the benchmark is unmodified, and + * (2) the version in the sccsid below is included in the report. + * Support for this development by Sun Microsystems is gratefully acknowledged. 
+ */ +char *id = "$Id$\n"; + +#include "bench.h" + +int find_tlb(int start, int maxpages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state); +void compute_times(int pages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state); + +#define THRESHOLD 1.15 + +/* + * Assumptions: + * + * 1) Cache lines are a multiple of pointer-size words + * 2) Cache lines no larger than 1/8 a page size + * 3) Pages are an even multiple of cache lines + */ +int +main(int ac, char **av) +{ + int i, l, len, tlb, maxpages; + int c; + int print_cost = 0; + int warmup = 0; + int repetitions = TRIES; + double tlb_time, cache_time, diff; + struct mem_state state; + char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; + + maxpages = 16 * 1024; + state.width = 1; + state.pagesize = getpagesize(); + state.line = sizeof(char*); + + tlb = 2; + + while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { + switch(c) { + case 'c': + print_cost = 1; + break; + case 'L': + state.line = atoi(optarg); + break; + case 'M': + maxpages = bytes(optarg); /* max in bytes */ + maxpages /= getpagesize(); /* max in pages */ + break; + case 'W': + warmup = atoi(optarg); + break; + case 'N': + repetitions = atoi(optarg); + break; + default: + lmbench_usage(ac, av, usage); + break; + } + } + + /* assumption: no TLB will have less than 16 entries */ + tlb = find_tlb(8, maxpages, warmup, repetitions, &tlb_time, &cache_time, &state); + + if (tlb > 0) { + if (print_cost) { + compute_times(tlb * 2, warmup, repetitions, &tlb_time, &cache_time, &state); + fprintf(stderr, "tlb: %d pages %.5f nanoseconds\n", tlb, tlb_time - cache_time); + } else { + fprintf(stderr, "tlb: %d pages\n", tlb); + } + } + + /* + for (i = tlb<<1; i <= maxpages; i<<=1) { + compute_times(i, warmup, repetitions, &tlb_time, &cache_time, &state); + } + /**/ + + return(0); +} + +int +find_tlb(int start, int maxpages, int warmup, int 
repetitions, + double* tlb_time, double* cache_time, struct mem_state* state) +{ + int i, lower, upper; + + for (i = start; i <= maxpages; i<<=1) { + compute_times(i, warmup, repetitions, tlb_time, cache_time, state); + + if (*tlb_time / *cache_time > THRESHOLD) { + lower = i>>1; + upper = i; + i = lower + (upper - lower) / 2; + break; + } + } + + /* we can't find any tlb effect */ + if (i >= maxpages) { + state->len = 0; + return (0); + } + + /* use a binary search to locate point at which TLB effects start */ + while (lower + 1 < upper) { + compute_times(i, warmup, repetitions, tlb_time, cache_time, state); + + if (*tlb_time / *cache_time > THRESHOLD) { + upper = i; + } else { + lower = i; + } + i = lower + (upper - lower) / 2; + } + return (lower); +} + +void +compute_times(int pages, int warmup, int repetitions, + double* tlb_time, double* cache_time, struct mem_state* state) +{ + int i; + result_t tlb_results, cache_results, *r_save; + + r_save = get_results(); + insertinit(&tlb_results); + insertinit(&cache_results); + + state->len = pages * state->pagesize; + state->maxlen = pages * state->pagesize; + tlb_initialize(0, state); + if (state->initialized) { + for (i = 0; i < TRIES; ++i) { + BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0); + insertsort(gettime(), get_n(), &tlb_results); + } + } + tlb_cleanup(0, state); + + state->len = pages * state->line; + state->maxlen = pages * state->line; + mem_initialize(0, state); + if (state->initialized) { + for (i = 0; i < TRIES; ++i) { + BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0); + insertsort(gettime(), get_n(), &cache_results); + } + } + mem_cleanup(0, state); + + /* We want nanoseconds / load. */ + set_results(&tlb_results); + *tlb_time = (1000. * (double)gettime()) / (100. * (double)get_n()); + + /* We want nanoseconds / load. */ + set_results(&cache_results); + *cache_time = (1000. * (double)gettime()) / (100. 
* (double)get_n()); + set_results(r_save); + + /* + fprintf(stderr, "%d %.5f %.5f\n", pages, *tlb_time, *cache_time); + /**/ +} + diff --git a/performance/lmbench3/src/version.h b/performance/lmbench3/src/version.h new file mode 100644 index 0000000..0dc306d --- /dev/null +++ b/performance/lmbench3/src/version.h @@ -0,0 +1,2 @@ +#define MAJOR 3 +#define MINOR -4 /* negative is alpha, it "increases" */ diff --git a/performance/lmbench3/src/webpage-lm.tar b/performance/lmbench3/src/webpage-lm.tar new file mode 100644 index 0000000..1e5bc3b Binary files /dev/null and b/performance/lmbench3/src/webpage-lm.tar differ diff --git a/performance/lmbench3/src/webpage-lm/URLS b/performance/lmbench3/src/webpage-lm/URLS new file mode 100644 index 0000000..4f54841 --- /dev/null +++ b/performance/lmbench3/src/webpage-lm/URLS @@ -0,0 +1,14 @@ +./pictures/me-small.jpg +./gifs/snow-bg2.jpg +./gifs/rib_bar_wh.gif +./gifs/spam-not.gif +./gifs/pookline.gif +./gifs/blueline +./gifs/eyes.gif +./gifs/eyesleft.gif +./gifs/new.gif +./gifs/line1.gif +./gifs/cclip3.gif +./gifs/sgi_logo.gif +./index.html +./URLS diff --git a/performance/lmbench3/src/webpage-lm/gifs/blueline b/performance/lmbench3/src/webpage-lm/gifs/blueline new file mode 100644 index 0000000..868d4fe Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/blueline differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif b/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif new file mode 100644 index 0000000..4697447 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/cclip3.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/eyes.gif b/performance/lmbench3/src/webpage-lm/gifs/eyes.gif new file mode 100644 index 0000000..443bce7 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/eyes.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif b/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif new file mode 100644 index 
0000000..6b5305b Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/eyesleft.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/line1.gif b/performance/lmbench3/src/webpage-lm/gifs/line1.gif new file mode 100644 index 0000000..a8de25e Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/line1.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/new.gif b/performance/lmbench3/src/webpage-lm/gifs/new.gif new file mode 100644 index 0000000..7df4823 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/new.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/pookline.gif b/performance/lmbench3/src/webpage-lm/gifs/pookline.gif new file mode 100644 index 0000000..593f7f3 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/pookline.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif b/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif new file mode 100644 index 0000000..02e55ff Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/rib_bar_wh.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif b/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif new file mode 100644 index 0000000..84baa47 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/sgi_logo.gif differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg b/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg new file mode 100644 index 0000000..3748971 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/snow-bg2.jpg differ diff --git a/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif b/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif new file mode 100644 index 0000000..7e89689 Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/gifs/spam-not.gif differ diff --git a/performance/lmbench3/src/webpage-lm/index.html b/performance/lmbench3/src/webpage-lm/index.html 
new file mode 100644 index 0000000..ed7ee98 --- /dev/null +++ b/performance/lmbench3/src/webpage-lm/index.html @@ -0,0 +1,253 @@ +<html> +<body background=gifs/snow-bg2.jpg> +<TITLE>Larry McVoy's home page</Title> + +<p align=center> +<A HREF=http://www.eff.org> <img src="gifs/rib_bar_wh.gif"> </a> +<A HREF=http://www.cauce.org> <img src="gifs/spam-not.gif"> </a> +</p> +<img src="gifs/pookline.gif"> +<img src="gifs/blueline"> +<H1 align=center>Larry McVoy's home page</H1> +<img src="gifs/blueline"> +<img src="gifs/pookline.gif"> +<p align=center> +<A HREF="pictures/me.jpg""> +<img src="pictures/me-small.jpg"> </a> +</p> +<H1 align=center> +Notice: I'm moving to a new job. New email is lm@xxxxxxx. +</H1> +</p> +<H1 align=center> +Who am I? +</H1> +<p> +I'm an engineer for Silicon Graphics, working in the networking group. +I spend most of my time waving my hands and convincing other people they +want to work on stuff that I think is important. The name server is +an example, I got +<A HREF="/jes_engr/">John Schimmel</a> to work on that. +I'm constantly trying +to figure out how to make things go fast, which is why I wrote the +<A HREF="lmbench/lmbench.html">lmbench</a> benchmark suite. Lmbench +measures the basic building blocks of a computer system. +Occasionally, I have to +do real work, like the BDS stuff mentioned below. +</p> +<p> +I live in San Francisco and divide my time there between my girlfriend, +woodworking, playing pool, and riding motorcycles. +<H1 align=center> +Current stuff I'm working on (slides) +</H1> +<UL> +<font size=+1> +<LI> +<img src="gifs/eyes.gif"> +<A HREF="lmbench/lmbench.html"> +lmbench benchmark suite with results</A> +<img src="gifs/eyesleft.gif"> +<br> +Watch this space for a new lmbench in July. 
+<LI><A HREF="diskbench/diskbench.html">New disk benchmarking tools</A> +<LI><A HREF="lamed.html">New free name server architecture</A> +<img src="gifs/new.gif"> +<LI><A HREF="talks/bds.ps">Bulk Data Service: 50MB/sec over NFS</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/EIS.slides.ps"> +EIS project (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/bds.Aug28.96.ps"> +BDS marketing talk of August 28 (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/nsf.Aug28.96.ps"> +SuperHippi presentation for NSF on August 28 (SGI internal only)</A> +<LI><A HREF="file:/hosts/neteng.engr/home1/lm/Doc/P/net.futures.Sep9.96.ps"> +SGI networking roadmap (SGI internal only)</A> +</font> +</UL> +<img src="gifs/pookline.gif"> +<H1 align=center> +Papers I've written +</H1> +<UL> +<LI><A HREF="lmbench/lmbench-usenix.ps">lmbench usenix paper</A> +with <A HREF="http://http.cs.berkeley.edu/~staelin/">Carl Staelin.</a> +<LI><A HREF="papers/SunOS.ufs_clustering.ps">SunOS UFS clustering usenix paper</A> +<LI><A HREF="papers/freeos.ps">A proposal to unify Unix (sigh)</A> +<LI><A HREF="papers/gkim.ps">A parallel NFS using RPC vectors (coauthor)</A> +<LI><A HREF="papers/nvram.ps">A early paper describing high perf disks</A> +<LI><A HREF="papers/smoosh.ps">A description of the core of Sun's Teamware +source management system</A> +<LI><A HREF="papers/sunbox.netarch.ps">SparcCluster 1 architecture - VLANs +came from this</A> +<LI><A HREF="lmdd.shar">Latest lmdd benchmarking source</a> +</UL> +<img src="gifs/pookline.gif"> +<H1 align=center> +Personal stuff (lots of pictures) +</H1> +<img src="gifs/line1.gif"> +<H2>Me, my relatives, friends, etc.</H2> +<UL> +<LI> +Me and <A HREF="pictures/me+jacob.jpg">my nephew</A> +Jacob at Ocean Beach in +San Francisco. He was about 2 years old and still hadn't hit the terrible +twos, I think his Mom must have done a good job. 
+Here he is with <a href="pictures/annelies+jacob.jpg">his Mom</a> about +7 months pregnant. The next one turned out to be a boy +<a href="pictures/annelies+zeke.jpg">named Zeke</a>. +<LI>My brother <A HREF="pictures/chris.jpg">Chris</A> trying to look smart. +<LI>I used to be even more crazy than I am now; here's a picture of me +doing some stupid <A HREF="pictures/skating.jpg">rollerblading</A> tricks. +<LI>My <A HREF="pictures/sail.jpg">favorite</A> picture of me. +<LI>I work at <A HREF="pictures/working.jpg">home</A> a lot and this is +what that is like. My cat was pretty sick in that picture, but I nursed her back to +the land of the living. +<LI><A HREF="pictures/me1.jpg">Me studying</A>. +<LI>A really old picture of me in Mexico, with really long hair +<A HREF="pictures/juggling.jpg">juggling</A>. +</UL> +<img src="gifs/line1.gif"> +<H2>My cats</H2> +<UL> +<LI>I like cats and I have had two over the last 18 years (whew) or so. +Here's <A HREF="pictures/zoey.jpg">Zoey</A> after she's had a few. +Looks possessed, doesn't she? Here's a +<A HREF="pictures/zoey2.jpg">better</a> picture of her. Until she died around +Christmas of 1994, she had outlasted all of my girlfriends - I had her +for almost 14 years. I still miss her and sometimes look for her when +I go into the kitchen - it's weird to think she's gone. +I eventually decided not to mope over her forever and went and +found <A HREF="pictures/mama+linux.jpg">Mama cat</A> +at the pound. That's Linux running on the PC next to her, she fixes a lot +of mouse driver bugs. +Here's another picture of <A HREF="pictures/cat.jpg">Mama cat on the workbench +</a>. And one <A HREF="pictures/mama1.jpg">more</a> of here in my van - she +likes to travel, no kidding. One last <A HREF="pictures/mama2.jpg">shot</a> +of her. +<LI>November '96: Mama cat is missing. +We're still looking for her, but it has been two weeks and +it isn't looking very hopeful. +<LI>January '97: Mama cat is still missing. 
I go to the pound about +once a week with no luck. It sucks. +</UL> +<img src="gifs/line1.gif"> +<H2>Fishing</H2> +<UL> +<LI>I like to fly fish (yeah, I tie my own, ooh, wow) and I took +a trip with my friend John Weitz. John is a hot shot +photographer and here he is at <A HREF="pictures/jonw-pic.jpg">work</a>. +Here's John catching a <A +HREF="pictures/john-fishing.jpg">trout</A> in the Trinity Alps. This is +<A HREF="pictures/fishing.jpg">me fishing</A> in the upper Sacramento River. +John was talking some shots of a cool old +<A HREF="pictures/house.jpg">shed</a>, +so I took one too. Here's a shot that John took of +<A HREF="pictures/redneck.jpg">me sitting in the doorway</a> of that shed +(warning: it's ~60Kb). +<LI>This is the ultimate in fishing tall tails, except I have pictures +to prove it happened. I was fishing in Canada and thought I had hooked +some weeds. I was reeling it in when all at once it took off. Funny sort +of fish, it felt weird. When I get closer, I saw that I had two fish - +a little one that had hit the lure, and a big +<A HREF="pictures/pike1.jpg">Northern Pike</A> that had hit the little pike. +I thought for sure he would let go when he saw me, but I guess he was +hungry, because I +<A HREF="pictures/pike2.jpg">picked him up</A>. Pretty wild, huh? +</UL> +<img src="gifs/line1.gif"> +<H2>Wilderness</H2> +<UL> +<LI>I like to <A HREF="pictures/backpacking.jpg">backpack</a> +a lot and I have some friends that go with me. Here's +<A HREF="pictures/neail+elvis2.jpg">Neil</a> with his dog Elvis and +<A HREF="pictures/neil+elvis.jpg">here</a> they are again hard at work. +<LI>Me <A HREF="pictures/me-skiing.jpg">cross country skiing</A> in the Sierra +back country. It was a weekend trip to Ostrander Hut/Lake (cool place). +I think that is Yosemite Valley in the background, doesn't that look like +half dome to you? Here's the same +<A HREF="pictures/skiing.jpg">view</a> +about 5 years earlier with my friends +John G., Bernd N., and Andy A. 
+<LI>My Dad's <A HREF="pictures/canoe+cover.jpg">Mad River canoe with a +cover</A> that my sister made (pretty cool cover, if you ask me, it +kept us dry). We go canoeing in Canada quite a bit. +</UL> +<img src="gifs/line1.gif"> +<H2>Woodworking</H2> +<UL> +<LI>I am not just a computer nerd, I'm also a woodworking nerd, and I'm +especially nerdy about <A HREF="pictures/planes.jpg">hand planes.</a> Many +of those are a hundred years old, some are more than that ("they don't +make 'em like they used to" definitely applies to tools). +Here's my first effort at a real woodworking project, what else, a +<A HREF="pictures/toolbox3.jpg">toolbox</a>. Here's a view with the +<A HREF="pictures/toolbox2.jpg">drawers open.</a> The little box on top is a +jewelry box (or whatever) I made for an old girlfriend. +I live in San Francisco, in +a flat, so my workshop is out on my <A HREF="pictures/jointer.jpg">back deck</a>. +That's a small jointer in the foreground and a table saw clamped to the +rails in the background. It's a bit cramped, but it has a nice +<A HREF="pictures/jointer2.jpg">view.</a> +I finally decided to build a <A HREF="pictures/workbench.jpg">workbench.</a> +Here's the <A HREF="pictures/benchtop.jpg">benchtop</a> in the process of being +hand planed flat (lotso shavings, huh?). +<LI> +I do stuff on commission sometimes, this is my last girlfriend with a +<A HREF="pictures/bookshelf.jpg">bookshelf</a> I built for a friend at +work. It was pretty simple since it was a first try, but he liked it. +Here's another picture of the <A HREF="pictures/bookshelf2.jpg">bookshelf</a>. +<LI>Here I am proudly showing off a little +<A HREF="pictures/tv-cabinet+me2.jpg">TV cabinet</a> made out of pine with some +really interesting grain. That's the heartwood of the pine. +Here's a <A HREF="pictures/tv-cabinet.jpg">closeup</a> picture of +the cabinet. 
+ +<LI> +Because space is tight in San Francisco, I think my next project will be +a tall, thin +<A HREF="pictures/chest.gif"> +chest of drawers</a> +sort of like a lingerie chest, only sized for guy's clothes. It's about +six feet tall by 18 inches square, which I think is about right. This was +drawn in James Clark's implementation of pic, in the groff tool suite. +Perverse, I know. +<LI> +Here is a document on <a href="papers/flattening.html">flattening</a> +hand planes, something that is frequently required for good performance. +</UL> +<img src="gifs/line1.gif"> +<H2>Amusements</H2> +<UL> +<LI>A <A href="excited.html">song</a> composed in my honor. No kidding. +It's pretty cute but you might need to know a little about Sun's internal +politics to completely get it. +<p> +<LI>A +<A HREF="javaletter.html"> +letter</a> +that Sun's lawyers recently sent. It's amazing how frigging +self centered people can be. I got yer Java right here, buddy. +<p> +A few days later, the net <A HREF="javaresp.html">responds.</a> +<p> +<li>Here are a bunch of +<A HREF="quote.html">quotes</a> +that I either liked or were attributed to me. A lot of these are pretty +nerdy engineer inside jokes, you've been warned. 
+</ul> +<img src="gifs/cclip3.gif"> +<p> +<img src="gifs/sgi_logo.gif" align=right> +<br> +<address> +Larry McVoy, +<a href="mailto:lm@xxxxxxx">lm@xxxxxxx</a> +</address> +<p> +Page accesses since Wed Jun 26 1996: +<img align=texttop src="/cgi-bin/Imagemap/hitcount?lm_home" +alt="[Sorry, counter is a GIF image!]"><br> +</p> + +</body></html> diff --git a/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg b/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg new file mode 100644 index 0000000..4205e8c Binary files /dev/null and b/performance/lmbench3/src/webpage-lm/pictures/me-small.jpg differ diff --git a/runtests.sh b/runtests.sh index d93cc70..924df4f 100755 --- a/runtests.sh +++ b/runtests.sh @@ -65,8 +65,11 @@ destructive) testset=default fi ;; +performance) + dirlist="performance" + ;; *) - echo "supported test sets are minimal default stress or destructive" + echo "supported test sets are minimal, default, stress, destructive or performance" exit 1 esac @@ -92,19 +95,24 @@ do #TO DO: purpose file test name format testname=$testdir echo "Starting test $testname" >> $logfile - ./runtest.sh &>>$logfile - complete=$? - case $complete in - 0) - result=PASS - ;; - 3) - result=SKIP - ;; - *) - result=FAIL - esac - printf "%-65s%-8s\n" "$testname" "$result" + + if [ "$testset" == "performance" ]; then + ./runtest.sh >>$logfile + else + ./runtest.sh &>>$logfile + complete=$? + case $complete in + 0) + result=PASS + ;; + 3) + result=SKIP + ;; + *) + result=FAIL + esac + printf "%-65s%-8s\n" "$testname" "$result" + fi popd &>/dev/null done done -- To stop receiving notification emails like this one, please contact the administrator of this repository. _______________________________________________ kernel mailing list -- kernel@xxxxxxxxxxxxxxxxxxxxxxx To unsubscribe send an email to kernel-leave@xxxxxxxxxxxxxxxxxxxxxxx