Recent changes (gfio)

The following changes since commit f681d0bac9c42e5d6bcb3601d2f3cfaa1c2cefb4:

  Merge branch 'master' into gfio (2012-10-10 21:43:46 -0600)

are available in the git repository at:

  git://git.kernel.dk/fio.git gfio

Jens Axboe (48):
      Update REPORTING-BUGS
      backend: only check bytes_done for engines without FIO_NOIO
      Add LICENSE file specyfing what I expect from people using fio
      rdma: use private random state
      rdma: fixup compile issue
      rdma: cleanup and fixes
      Add max_latency option
      smalloc: fix int truncation issue
      Merge branch 'master' of ssh://brick.kernel.dk/data/git/fio
      smalloc: move to size_t for allocations
      Add sample zipf distribution randomizer
      Merge branch 'master' of ssh://brick.kernel.dk/data/git/fio
      Add forgotten minmax.h include file
      Add pareto distribution randomizer
      zipf: needs inttypes.h
      Add t/genzipf to play with distribution settings
      t/genzipf update
      genzip updates
      Fix t/ieee754 link
      Move code around to satisfy t/stest linkage
      Make the zipf/pareto state per file
      zipf/pareto: mix blocks with hashing
      rbtree: add rb_next()
      genzipf: improve accuracy
      genzipf: fix off-by-one in output array calculation
      Merge branch 'master' of ssh://brick.kernel.dk/data/git/fio
      Makefile: typo
      Add safe checks for valid pareto input value
      zipf/pareto: ensure that 0 isn't always the hottest block
      zipf: cleanup
      zipf: seed zipf/pareto rand with filename hash and job id
      Zipf theta must be different than 1.0
      zipf: cap range calculation at 10M
      zipf: kill debug "generating series" messages
      Use ETIMEDOUT instead of ETIME for exceeding max latency
      Use more portable construct for setting/checking CC
      Makefile: 2nd go at making CC assignment/check actually work
      genzipf: more features
      zipf: use 64-bit safe hash for zipf/pareto
      genzipf: add size and percentage hit rates
      JSON: fix escape of '"' and '\' characters
      genzipf: use regular array + qsort() instead of rbtree
      json: fix off-by-one in memory alloc
      e4defrag: fix always true comparison
      Fix wrong return type on td_error_type()
      Get rid of uninitialized_var()
      client/server: fixup "All clients" reporting
      Merge branch 'master' into gfio

Yufei Ren (5):
      cpuio engine cpuload bug fix
      thread cpu resource statistics bug fix
      Fine-grained job level numa control
      Replace FIO_HAVE_RUSAGE_THREAD with RUSAGE_THREAD
      rdma ioengine improvement

 HOWTO               |   23 ++++-
 LICENSE             |   17 +++
 Makefile            |   20 +++-
 README              |   14 ++-
 REPORTING-BUGS      |    3 +-
 backend.c           |   48 ++++++++-
 cconv.c             |    8 ++
 client.c            |   18 +++-
 client.h            |    2 +
 compiler/compiler.h |    2 -
 engines/e4defrag.c  |    2 +-
 engines/rdma.c      |  142 ++++++++++++++-----------
 engines/splice.c    |    2 +-
 eta.c               |    5 +-
 examples/cpuio      |    8 ++
 examples/numa       |   21 ++++
 examples/zipf       |   10 ++
 file.h              |    6 +
 filesetup.c         |   39 +++++++
 fio.1               |   26 +++++
 fio.h               |   20 ++++-
 gettime-thread.c    |   78 ++++++++++++++
 gettime.c           |  101 +++++++++---------
 gettime.h           |    2 +
 hash.h              |   16 +++-
 init.c              |    4 +
 io_u.c              |   50 +++++++--
 json.c              |   31 ++++++-
 lib/rbtree.c        |   31 ++++++
 lib/rbtree.h        |    1 +
 lib/zipf.c          |   88 ++++++++++++++++
 lib/zipf.h          |   23 ++++
 minmax.h            |   11 ++
 options.c           |  207 +++++++++++++++++++++++++++++++++++++
 parse.c             |    3 +-
 parse.h             |    8 +--
 server.c            |    6 +
 server.h            |    3 +-
 smalloc.c           |   33 +++---
 smalloc.h           |    2 +-
 stat.c              |    4 +
 t/genzipf.c         |  287 +++++++++++++++++++++++++++++++++++++++++++++++++++
 t/log.c             |   13 +++
 t/stest.c           |    6 +
 thread_options.h    |   20 ++++
 time.c              |   66 ------------
 46 files changed, 1291 insertions(+), 239 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 examples/cpuio
 create mode 100644 examples/numa
 create mode 100644 examples/zipf
 create mode 100644 gettime-thread.c
 create mode 100644 lib/zipf.c
 create mode 100644 lib/zipf.h
 create mode 100644 minmax.h
 create mode 100644 t/genzipf.c

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index 1fb30db..5611814 100644
--- a/HOWTO
+++ b/HOWTO
@@ -780,6 +780,9 @@ rate_iops_min=int If fio doesn't meet this rate of IO, it will cause
 		the job to exit. The same format as rate is used for read vs
 		write seperation.
 
+max_latency=int	If set, fio will exit the job if it exceeds this maximum
+		latency. It will exit with an ETIME error.
+
 ratecycle=int	Average bandwidth for 'rate' and 'ratemin' over this number
 		of milliseconds.
 
@@ -799,6 +802,24 @@ cpus_allowed=str Controls the same options as cpumask, but it allows a text
 		allows a range of CPUs. Say you wanted a binding to CPUs
 		1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
 
+numa_cpu_nodes=str Set this job running on spcified NUMA nodes' CPUs. The
+		arguments allow comma delimited list of cpu numbers,
+		A-B ranges, or 'all'. Note, to enable numa options support,
+		export the following environment variables,
+			export EXTFLAGS+=" -DFIO_HAVE_LIBNUMA "
+			export EXTLIBS+=" -lnuma "
+
+numa_mem_policy=str Set this job's memory policy and corresponding NUMA
+		nodes. Format of the argements:
+			<mode>[:<nodelist>]
+		`mode' is one of the following memory policy:
+			default, prefer, bind, interleave, local
+		For `default' and `local' memory policy, no node is
+		needed to be specified.
+		For `prefer', only one node is allowed.
+		For `bind' and `interleave', it allow comma delimited
+		list of numbers, A-B ranges, or 'all'.
+
 startdelay=time	Start this job the specified number of seconds after fio
 		has started. Only useful if the job file contains several
 		jobs, and you want to delay starting some jobs to a certain
@@ -1363,7 +1384,7 @@ Idle	Run
 ----    ---
 P		Thread setup, but not started.
 C		Thread created.
-I		Thread initialized, waiting.
+I		Thread initialized, waiting or generating necessary data.
 	p	Thread running pre-reading file(s).
 	R	Running, doing sequential reads.
 	r	Running, doing random reads.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d7c0b1b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,17 @@
+As specified by the COPYING file, fio is free software published under version
+2 of the GPL license. That covers the copying part of the license. By using fio,
+you are also promising to uphold the following moral obligations:
+
+- If you publish results that are done using fio, it must be clearly stated
+  that fio was used. The specific version should also be listed.
+
+- If you develop features or bug fixes for fio, they should be sent upstream
+  for inclusion into the main repository. This isn't specific to fio, that
+  is a general rule for any open source project. It's just the Right Thing
+  to do. Plus it means that you don't have to maintain the feature or change
+  internally. In the long run, this is saving you a lot of time.
+
+I would consider the above to fall under "common courtesy", but since
+people tend to have differing opinions of that, it doesn't hurt to spell out
+my expectations clearly.
+
diff --git a/Makefile b/Makefile
index d851640..b0b6857 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
-CC	?= gcc
+ifneq ($(origin CC), environment)
+CC	= gcc
+endif
 DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG
 CPPFLAGS= -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \
 	$(DEBUGFLAGS)
@@ -18,7 +20,7 @@ SOURCE := gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \
 		lib/num2str.c lib/ieee754.c $(wildcard crc/*.c) engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
 		memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
-		cconv.c lib/prio_tree.c json.c
+		cconv.c lib/prio_tree.c lib/zipf.c json.c gettime-thread.c
 
 ifeq ($(UNAME), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c helpers.c cgroup.c trim.c \
@@ -75,15 +77,24 @@ GFIO_OBJS = $(OBJS) gfio.o graph.o tickmarks.o ghelpers.o goptions.o gerror.o \
 			gclient.o gcompat.o cairo_text_helpers.o printing.o
 
 T_SMALLOC_OBJS = t/stest.o
-T_SMALLOC_OBJS += mutex.o smalloc.o t/log.o gettime.o time.o
+T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o
 T_SMALLOC_PROGS = t/stest
 
 T_IEEE_OBJS = t/ieee754.o
 T_IEEE_OBJS += lib/ieee754.o
 T_IEEE_PROGS = t/ieee754
 
+T_ZIPF_OBS = t/genzipf.o
+T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o
+T_ZIPF_PROGS = t/genzipf
+
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
+T_OBJS += $(T_ZIPF_OBJS)
+
+T_PROGS = $(T_SMALLOC_PROGS)
+T_PROGS += $(T_IEEE_PROGS)
+T_PROGS += $(T_ZIPF_PROGS)
 
 ifneq ($(findstring $(MAKEFLAGS),s),s)
 ifndef V
@@ -158,6 +169,9 @@ fio: $(FIO_OBJS)
 gfio: $(GFIO_OBJS)
 	$(QUIET_CC)$(CC) $(LIBS) -o gfio $(GFIO_OBJS) $(LIBS) $(GTK_LDFLAGS)
 
+t/genzipf: $(T_ZIPF_OBJS)
+	$(QUIET_CC)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS) $(LDFLAGS)
+
 .depend: $(SOURCE)
 	$(QUIET_DEP)$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SOURCE) 1> .depend
 
diff --git a/README b/README
index 535b077..ceac385 100644
--- a/README
+++ b/README
@@ -233,10 +233,11 @@ The job file parameters are:
 			readv/writev (with queuing emulation) mmap for mmap'ed
 			io, syslet-rw for syslet driven read/write, splice for
 			using splice/vmsplice, sg for direct SG_IO io, net
-			for network io, or cpuio for a cycler burner load. sg
-			only works on Linux on SCSI (or SCSI-like devices, such
-			as usb-storage or sata/libata driven) devices. Fio also
-			has a null io engine, which is mainly used for testing
+			for network io, rdma for RDMA io, or cpuio for a
+			cycler burner load. sg only works on Linux on
+			SCSI (or SCSI-like devices, such as usb-storage or
+			sata/libata driven) devices. Fio also has a null
+			io engine, which is mainly used for testing
 			fio itself.
 
 	iodepth=x	For async io, allow 'x' ios in flight
@@ -255,6 +256,11 @@ The job file parameters are:
 	ratecycle=x	ratemin averaged over x msecs
 	cpumask=x	Only allow job to run on CPUs defined by mask.
 	cpus_allowed=x	Like 'cpumask', but allow text setting of CPU affinity.
+	numa_cpu_nodes=x,y-z  Allow job to run on specified NUMA nodes' CPU.
+	numa_mem_policy=m:x,y-z  Setup numa memory allocation policy.
+			'm' stands for policy, such as local, interleave,
+			bind, prefer, local. 'x, y-z' are numa node(s) for
+			memory allocation according to policy.
 	fsync=x		If writing with buffered IO, fsync after every
 			'x' blocks have been written.
 	end_fsync=x	If 'x', run fsync() after end-of-job.
diff --git a/REPORTING-BUGS b/REPORTING-BUGS
index ded68ab..c6150d1 100644
--- a/REPORTING-BUGS
+++ b/REPORTING-BUGS
@@ -8,8 +8,7 @@ to report at least:
 1) A description of what you think the bug is
 2) Environment (Linux distro version, kernel version). This is mostly
    needed if it's a build bug.
-3) The fio version. The most useful is the git top level commit, you
-   can find out by doing a cat .git/HEAD from the top level fio directory.
+3) The output from fio --version.
 4) How to reproduce. Please include a full list of the parameters
    passed to fio and the job file used (if any).
 
diff --git a/backend.c b/backend.c
index 974384c..39ef759 100644
--- a/backend.c
+++ b/backend.c
@@ -64,6 +64,7 @@ struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 
 int groupid = 0;
 unsigned int thread_number = 0;
+unsigned int stat_number = 0;
 int shm_id = 0;
 int temp_stall_ts;
 unsigned long done_secs = 0;
@@ -591,7 +592,7 @@ static void do_io(struct thread_data *td)
 		int ret2, full;
 		enum fio_ddir ddir;
 
-		if (td->terminate)
+		if (td->terminate || td->done)
 			break;
 
 		update_tv_cache(td);
@@ -726,7 +727,7 @@ sync_done:
 
 		if (ret < 0)
 			break;
-		if (!ddir_rw_sum(bytes_done))
+		if (!ddir_rw_sum(bytes_done) && !(td->io_ops->flags & FIO_NOIO))
 			continue;
 
 		if (!in_ramp_time(td) && should_check_rate(td, bytes_done)) {
@@ -1064,6 +1065,49 @@ static void *thread_main(void *data)
 	if (fio_pin_memory(td))
 		goto err;
 
+#ifdef FIO_HAVE_LIBNUMA
+	/* numa node setup */
+	if (td->o.numa_cpumask_set || td->o.numa_memmask_set) {
+		int ret;
+
+		if (numa_available() < 0) {
+			td_verror(td, errno, "Does not support NUMA API\n");
+			goto err;
+		}
+
+		if (td->o.numa_cpumask_set) {
+			ret = numa_run_on_node_mask(td->o.numa_cpunodesmask);
+			if (ret == -1) {
+				td_verror(td, errno, \
+					"numa_run_on_node_mask failed\n");
+				goto err;
+			}
+		}
+
+		if (td->o.numa_memmask_set) {
+
+			switch (td->o.numa_mem_mode) {
+			case MPOL_INTERLEAVE:
+				numa_set_interleave_mask(td->o.numa_memnodesmask);
+				break;
+			case MPOL_BIND:
+				numa_set_membind(td->o.numa_memnodesmask);
+				break;
+			case MPOL_LOCAL:
+				numa_set_localalloc();
+				break;
+			case MPOL_PREFERRED:
+				numa_set_preferred(td->o.numa_mem_prefer_node);
+				break;
+			case MPOL_DEFAULT:
+			default:
+				break;
+			}
+
+		}
+	}
+#endif
+
 	/*
 	 * May alter parameters that init_io_u() will use, so we need to
 	 * do this first.
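
The numa_cpu_nodes handling above boils down to two libnuma calls: numa_parse_nodestring() in the option callback (see the options.c hunk further down) and numa_run_on_node_mask() here in thread_main(). A minimal standalone sketch of that path, assuming libnuma is installed and the program is linked with -lnuma; the "0-1" nodelist is just an example value:

#include <stdio.h>
#include <numa.h>

int main(void)
{
	struct bitmask *mask;
	char nodes[] = "0-1";	/* example nodelist, same syntax as numa_cpu_nodes= */

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support on this system\n");
		return 1;
	}

	/* same parser the str_numa_cpunodes_cb() option callback uses */
	mask = numa_parse_nodestring(nodes);
	if (!mask) {
		fprintf(stderr, "numa_parse_nodestring failed\n");
		return 1;
	}

	/* restrict this thread to the given nodes' CPUs, as thread_main() does */
	if (numa_run_on_node_mask(mask) == -1) {
		perror("numa_run_on_node_mask");
		return 1;
	}

	/* the comment in options.c notes the mask must be freed this way */
	numa_free_nodemask(mask);
	return 0;
}
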
diff --git a/cconv.c b/cconv.c
index b023315..ca97c73 100644
--- a/cconv.c
+++ b/cconv.c
@@ -118,6 +118,9 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->softrandommap = le32_to_cpu(top->softrandommap);
 	o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
 	o->fsync_on_close = le32_to_cpu(top->fsync_on_close);
+	o->random_distribution = le32_to_cpu(top->random_distribution);
+	o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i));
+	o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i));
 	o->hugepage_size = le32_to_cpu(top->hugepage_size);
 	o->rw_min_bs = le32_to_cpu(top->rw_min_bs);
 	o->thinktime = le32_to_cpu(top->thinktime);
@@ -143,6 +146,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->loops = le32_to_cpu(top->loops);
 	o->mem_type = le32_to_cpu(top->mem_type);
 	o->mem_align = le32_to_cpu(top->mem_align);
+	o->max_latency = le32_to_cpu(top->max_latency);
 	o->stonewall = le32_to_cpu(top->stonewall);
 	o->new_group = le32_to_cpu(top->new_group);
 	o->numjobs = le32_to_cpu(top->numjobs);
@@ -267,6 +271,9 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->softrandommap = cpu_to_le32(o->softrandommap);
 	top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
 	top->fsync_on_close = cpu_to_le32(o->fsync_on_close);
+	top->random_distribution = cpu_to_le32(o->random_distribution);
+	top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f));
+	top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f));
 	top->hugepage_size = cpu_to_le32(o->hugepage_size);
 	top->rw_min_bs = cpu_to_le32(o->rw_min_bs);
 	top->thinktime = cpu_to_le32(o->thinktime);
@@ -281,6 +288,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->loops = cpu_to_le32(o->loops);
 	top->mem_type = cpu_to_le32(o->mem_type);
 	top->mem_align = cpu_to_le32(o->mem_align);
+	top->max_latency = cpu_to_le32(o->max_latency);
 	top->stonewall = cpu_to_le32(o->stonewall);
 	top->new_group = cpu_to_le32(o->new_group);
 	top->numjobs = cpu_to_le32(o->numjobs);
diff --git a/client.c b/client.c
index 7b8dc61..9cbbcf6 100644
--- a/client.c
+++ b/client.c
@@ -55,6 +55,7 @@ struct group_run_stats client_gs;
 int sum_stat_clients;
 
 static int sum_stat_nr;
+static int do_output_all_clients;
 
 #define FIO_CLIENT_HASH_BITS	7
 #define FIO_CLIENT_HASH_SZ	(1 << FIO_CLIENT_HASH_BITS)
@@ -116,6 +117,9 @@ void fio_put_client(struct fio_client *client)
 	if (client->ini_file)
 		free(client->ini_file);
 
+	if (!client->did_stat)
+		sum_stat_clients -= client->nr_stat;
+
 	free(client);
 }
 
@@ -142,8 +146,6 @@ static void remove_client(struct fio_client *client)
 		client->ops->removed(client);
 
 	nr_clients--;
-	sum_stat_clients--;
-
 	fio_put_client(client);
 }
 
@@ -776,8 +778,9 @@ static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd)
 	struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
 
 	show_thread_status(&p->ts, &p->rs);
+	client->did_stat = 1;
 
-	if (sum_stat_clients == 1)
+	if (!do_output_all_clients)
 		return;
 
 	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr);
@@ -1016,7 +1019,13 @@ static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd)
 	struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload;
 
 	client->state = Client_started;
-	client->jobs = pdu->jobs;
+	client->jobs = le32_to_cpu(pdu->jobs);
+	client->nr_stat = le32_to_cpu(pdu->stat_outputs);
+
+	if (sum_stat_clients > 1)
+		do_output_all_clients = 1;
+
+	sum_stat_clients += client->nr_stat;
 }
 
 static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd)
@@ -1345,7 +1354,6 @@ int fio_handle_clients(struct client_ops *ops)
 
 	pfds = malloc(nr_clients * sizeof(struct pollfd));
 
-	sum_stat_clients = nr_clients;
 	init_thread_stat(&client_ts);
 	init_group_run_stat(&client_gs);
 
diff --git a/client.h b/client.h
index 341d260..10d6ec3 100644
--- a/client.h
+++ b/client.h
@@ -42,10 +42,12 @@ struct fio_client {
 	int is_sock;
 	int disk_stats_shown;
 	unsigned int jobs;
+	unsigned int nr_stat;
 	int error;
 	int signal;
 	int ipv6;
 	int sent_job;
+	int did_stat;
 	uint32_t type;
 
 	uint32_t thread_number;
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 8dec350..8923f9a 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -13,8 +13,6 @@
 #define __must_check
 #endif
 
-#define uninitialized_var(x) x = x
-
 #ifndef _weak
 #ifndef __CYGWIN__
 #define _weak	__attribute__((weak))
diff --git a/engines/e4defrag.c b/engines/e4defrag.c
index cc88493..e10cf36 100644
--- a/engines/e4defrag.c
+++ b/engines/e4defrag.c
@@ -161,7 +161,7 @@ static int fio_e4defrag_queue(struct thread_data *td, struct io_u *io_u)
 	ret = ioctl(f->fd, EXT4_IOC_MOVE_EXT, &me);
 	len = me.moved_len * ed->bsz;
 
-	if (io_u->file && len >= 0 && ddir_rw(io_u->ddir))
+	if (io_u->file && len && ddir_rw(io_u->ddir))
 		io_u->file->file_pos = io_u->offset + len;
 
 	if (len > io_u->xfer_buflen)
diff --git a/engines/rdma.c b/engines/rdma.c
index 79d72d2..9b18301 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -7,14 +7,25 @@
  *
  * This I/O engine is disabled by default. To enable it, execute:
  *
- * $ export EXTFLAGS="-DFIO_HAVE_RDMA"
- * $ export EXTLIBS="-libverbs -lrdmacm"
+ * $ export EXTFLAGS+=" -DFIO_HAVE_RDMA "
+ * $ export EXTLIBS+=" -libverbs -lrdmacm "
  *
  * before running make. You will need the Linux RDMA software as well, either
  * from your Linux distributor or directly from openfabrics.org:
  *
  * http://www.openfabrics.org/downloads/OFED/
  *
+ * Exchanging steps of RDMA ioengine control messages:
+ *	1. client side sends test mode (RDMA_WRITE/RDMA_READ/SEND)
+ *	   to server side.
+ *	2. server side parses test mode, and sends back confirmation
+ *	   to client side. In RDMA WRITE/READ test, this confirmation
+ *	   includes memory information, such as rkey, address.
+ *	3. client side initiates test loop.
+ *	4. In RDMA WRITE/READ test, client side sends a completion
+ *	   notification to server side. Server side updates its
+ *	   td->done as true.
+ *
  */
 #include <stdio.h>
 #include <stdlib.h>
@@ -35,13 +46,14 @@
 #include <inttypes.h>
 
 #include "../fio.h"
+#include "../hash.h"
 
 #ifdef FIO_HAVE_RDMA
 
 #include <rdma/rdma_cma.h>
 #include <infiniband/arch.h>
 
-#define FIO_RDMA_MAX_IO_DEPTH    128
+#define FIO_RDMA_MAX_IO_DEPTH    512
 
 enum rdma_io_mode {
 	FIO_RDMA_UNKNOWN = 0,
@@ -108,6 +120,8 @@ struct rdmaio_data {
 	int io_u_flight_nr;
 	struct io_u **io_us_completed;
 	int io_u_completed_nr;
+
+	struct frand_state rand_state;
 };
 
 static int client_recv(struct thread_data *td, struct ibv_wc *wc)
@@ -311,6 +325,7 @@ static int fio_rdmaio_setup_qp(struct thread_data *td)
 		rd->pd = ibv_alloc_pd(rd->child_cm_id->verbs);
 	else
 		rd->pd = ibv_alloc_pd(rd->cm_id->verbs);
+
 	if (rd->pd == NULL) {
 		log_err("fio: ibv_alloc_pd fail\n");
 		return 1;
@@ -402,7 +417,7 @@ static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td)
 	/* setup work request */
 	/* recv wq */
 	rd->recv_sgl.addr = (uint64_t) (unsigned long)&rd->recv_buf;
-	rd->recv_sgl.length = sizeof rd->recv_buf;
+	rd->recv_sgl.length = sizeof(rd->recv_buf);
 	rd->recv_sgl.lkey = rd->recv_mr->lkey;
 	rd->rq_wr.sg_list = &rd->recv_sgl;
 	rd->rq_wr.num_sge = 1;
@@ -410,7 +425,7 @@ static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td)
 
 	/* send wq */
 	rd->send_sgl.addr = (uint64_t) (unsigned long)&rd->send_buf;
-	rd->send_sgl.length = sizeof rd->send_buf;
+	rd->send_sgl.length = sizeof(rd->send_buf);
 	rd->send_sgl.lkey = rd->send_mr->lkey;
 
 	rd->sq_wr.opcode = IBV_WR_SEND;
@@ -427,13 +442,12 @@ static int get_next_channel_event(struct thread_data *td,
 				  enum rdma_cm_event_type wait_event)
 {
 	struct rdmaio_data *rd = td->io_ops->data;
-
-	int ret;
 	struct rdma_cm_event *event;
+	int ret;
 
 	ret = rdma_get_cm_event(channel, &event);
 	if (ret) {
-		log_err("fio: rdma_get_cm_event");
+		log_err("fio: rdma_get_cm_event: %d\n", ret);
 		return 1;
 	}
 
@@ -507,9 +521,9 @@ static struct io_u *fio_rdmaio_event(struct thread_data *td, int event)
 	int i;
 
 	io_u = rd->io_us_completed[0];
-	for (i = 0; i < rd->io_u_completed_nr - 1; i++) {
+	for (i = 0; i < rd->io_u_completed_nr - 1; i++)
 		rd->io_us_completed[i] = rd->io_us_completed[i + 1];
-	}
+
 	rd->io_u_completed_nr--;
 
 	dprint_io_u(io_u, "fio_rdmaio_event");
@@ -521,14 +535,11 @@ static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min,
 				unsigned int max, struct timespec *t)
 {
 	struct rdmaio_data *rd = td->io_ops->data;
-	int r;
 	enum ibv_wc_opcode comp_opcode;
 	comp_opcode = IBV_WC_RDMA_WRITE;
 	struct ibv_cq *ev_cq;
 	void *ev_ctx;
-	int ret;
-
-	r = 0;
+	int ret, r = 0;
 
 	switch (rd->rdma_protocol) {
 	case FIO_RDMA_MEM_WRITE:
@@ -591,7 +602,8 @@ static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
 	enum ibv_wc_opcode comp_opcode;
 	comp_opcode = IBV_WC_RDMA_WRITE;
 #endif
-	int i, index;
+	int i;
+	long index;
 	struct rdma_io_u_data *r_io_u_d;
 
 	r_io_u_d = NULL;
@@ -602,7 +614,7 @@ static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
 		case FIO_RDMA_MEM_WRITE:
 			/* compose work request */
 			r_io_u_d = io_us[i]->engine_data;
-			index = rand() % rd->rmt_nr;
+			index = __rand(&rd->rand_state) % rd->rmt_nr;
 			r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_WRITE;
 			r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
 			r_io_u_d->sq_wr.wr.rdma.remote_addr = \
@@ -612,7 +624,7 @@ static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
 		case FIO_RDMA_MEM_READ:
 			/* compose work request */
 			r_io_u_d = io_us[i]->engine_data;
-			index = rand() % rd->rmt_nr;
+			index = __rand(&rd->rand_state) % rd->rmt_nr;
 			r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_READ;
 			r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
 			r_io_u_d->sq_wr.wr.rdma.remote_addr = \
@@ -734,11 +746,11 @@ static int fio_rdmaio_commit(struct thread_data *td)
 	io_us = rd->io_us_queued;
 	do {
 		/* RDMA_WRITE or RDMA_READ */
-		if (rd->is_client) {
+		if (rd->is_client)
 			ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr);
-		} else if (!rd->is_client) {
+		else if (!rd->is_client)
 			ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr);
-		} else
+		else
 			ret = 0;	/* must be a SYNC */
 
 		if (ret > 0) {
@@ -760,7 +772,7 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
 	struct rdma_conn_param conn_param;
 	struct ibv_send_wr *bad_wr;
 
-	memset(&conn_param, 0, sizeof conn_param);
+	memset(&conn_param, 0, sizeof(conn_param));
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 	conn_param.retry_count = 10;
@@ -790,6 +802,16 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
 	/* wait for remote MR info from server side */
 	rdma_poll_wait(td, IBV_WC_RECV);
 
+	/* In SEND/RECV test, it's a good practice to setup the iodepth of
+	 * of the RECV side deeper than that of the SEND side to
+	 * avoid RNR (receiver not ready) error. The
+	 * SEND side may send so many unsolicited message before
+	 * RECV side commits sufficient recv buffers into recv queue.
+	 * This may lead to RNR error. Here, SEND side pauses for a while
+	 * during which RECV side commits sufficient recv buffers.
+	 */
+	usleep(500000);
+
 	return 0;
 }
 
@@ -800,7 +822,7 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)
 	struct ibv_send_wr *bad_wr;
 
 	/* rdma_accept() - then wait for accept success */
-	memset(&conn_param, 0, sizeof conn_param);
+	memset(&conn_param, 0, sizeof(conn_param));
 	conn_param.responder_resources = 1;
 	conn_param.initiator_depth = 1;
 
@@ -863,17 +885,20 @@ static int fio_rdmaio_close_file(struct thread_data *td, struct fio_file *f)
 		rdma_disconnect(rd->cm_id);
 	else {
 		rdma_disconnect(rd->child_cm_id);
-/*        rdma_disconnect(rd->cm_id); */
+#if 0
+		rdma_disconnect(rd->cm_id);
+#endif
 	}
 
-/*    if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0)
-    {
-        log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n");
-        return 1;
-    }*/
+#if 0
+	if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0) {
+		log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n");
+		return 1;
+	}
+#endif
 
-	ibv_destroy_qp(rd->qp);
 	ibv_destroy_cq(rd->cq);
+	ibv_destroy_qp(rd->qp);
 
 	if (rd->is_client == 1)
 		rdma_destroy_id(rd->cm_id);
@@ -893,6 +918,7 @@ static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host,
 {
 	struct rdmaio_data *rd = td->io_ops->data;
 	struct ibv_recv_wr *bad_wr;
+	int err;
 
 	rd->addr.sin_family = AF_INET;
 	rd->addr.sin_port = htons(port);
@@ -910,28 +936,28 @@ static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host,
 	}
 
 	/* resolve route */
-	if (rdma_resolve_addr(rd->cm_id, NULL,
-			      (struct sockaddr *)&rd->addr, 2000) != 0) {
-		log_err("fio: rdma_resolve_addr");
+	err = rdma_resolve_addr(rd->cm_id, NULL, (struct sockaddr *)&rd->addr, 2000);
+	if (err != 0) {
+		log_err("fio: rdma_resolve_addr: %d\n", err);
 		return 1;
 	}
 
-	if (get_next_channel_event
-	    (td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED)
-	    != 0) {
-		log_err("fio: get_next_channel_event");
+	err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
+	if (err != 0) {
+		log_err("fio: get_next_channel_event: %d\n", err);
 		return 1;
 	}
 
 	/* resolve route */
-	if (rdma_resolve_route(rd->cm_id, 2000) != 0) {
-		log_err("fio: rdma_resolve_route");
+	err = rdma_resolve_route(rd->cm_id, 2000);
+	if (err != 0) {
+		log_err("fio: rdma_resolve_route: %d\n", err);
 		return 1;
 	}
 
-	if (get_next_channel_event
-	    (td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED) != 0) {
-		log_err("fio: get_next_channel_event");
+	err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
+	if (err != 0) {
+		log_err("fio: get_next_channel_event: %d\n", err);
 		return 1;
 	}
 
@@ -943,8 +969,9 @@ static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host,
 		return 1;
 
 	/* post recv buf */
-	if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_recv fail\n");
+	err = ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr);
+	if (err != 0) {
+		log_err("fio: ibv_post_recv fail: %d\n", err);
 		return 1;
 	}
 
@@ -996,10 +1023,12 @@ static int fio_rdmaio_setup_listen(struct thread_data *td, short port)
 static int fio_rdmaio_init(struct thread_data *td)
 {
 	struct rdmaio_data *rd = td->io_ops->data;
+	struct flist_head *entry;
+	unsigned int max_bs;
 	unsigned int port;
 	char host[64], buf[128];
 	char *sep, *portp, *modep;
-	int ret;
+	int ret, i = 0;
 	struct rlimit rl;
 
 	if (td_rw(td)) {
@@ -1119,11 +1148,8 @@ static int fio_rdmaio_init(struct thread_data *td)
 		ret = fio_rdmaio_setup_connect(td, host, port);
 	}
 
-	struct flist_head *entry;
-	unsigned int max_bs;
 	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
 	/* register each io_u in the free list */
-	int i = 0;
 	flist_for_each(entry, &td->io_u_freelist) {
 		struct io_u *io_u = flist_entry(entry, struct io_u, list);
 
@@ -1145,8 +1171,9 @@ static int fio_rdmaio_init(struct thread_data *td)
 		rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
 		rd->send_buf.rmt_us[i].size = htonl(max_bs);
 
-/*    log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n",
-          io_u->mr->rkey, io_u->buf, max_bs); */
+#if 0
+		log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n", io_u->mr->rkey, io_u->buf, max_bs); */
+#endif
 		i++;
 	}
 
@@ -1162,16 +1189,8 @@ static void fio_rdmaio_cleanup(struct thread_data *td)
 {
 	struct rdmaio_data *rd = td->io_ops->data;
 
-	if (rd) {
-/*        if (nd->listenfd != -1)
-            close(nd->listenfd);
-        if (nd->pipes[0] != -1)
-            close(nd->pipes[0]);
-        if (nd->pipes[1] != -1)
-            close(nd->pipes[1]);
-*/
+	if (rd)
 		free(rd);
-	}
 }
 
 static int fio_rdmaio_setup(struct thread_data *td)
@@ -1179,9 +1198,10 @@ static int fio_rdmaio_setup(struct thread_data *td)
 	struct rdmaio_data *rd;
 
 	if (!td->io_ops->data) {
-		rd = malloc(sizeof(*rd));;
+		rd = malloc(sizeof(*rd));
 
 		memset(rd, 0, sizeof(*rd));
+		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME);
 		td->io_ops->data = rd;
 	}
 
@@ -1229,8 +1249,8 @@ static int fio_rdmaio_init(struct thread_data fio_unused * td)
 	log_err("     make sure OFED is installed,\n");
 	log_err("     $ ofed_info\n");
 	log_err("     then try to make fio as follows:\n");
-	log_err("     $ export EXTFLAGS=\"-DFIO_HAVE_RDMA\"\n");
-	log_err("     $ export EXTLIBS=\"-libverbs -lrdmacm\"\n");
+	log_err("     $ export EXTFLAGS+=\" -DFIO_HAVE_RDMA \"\n");
+	log_err("     $ export EXTLIBS+=\" -libverbs -lrdmacm \"\n");
 	log_err("     $ make clean && make\n");
 	return 1;
 }
diff --git a/engines/splice.c b/engines/splice.c
index aa00234..ca7997b 100644
--- a/engines/splice.c
+++ b/engines/splice.c
@@ -204,7 +204,7 @@ static int fio_splice_write(struct thread_data *td, struct io_u *io_u)
 static int fio_spliceio_queue(struct thread_data *td, struct io_u *io_u)
 {
 	struct spliceio_data *sd = td->io_ops->data;
-	int uninitialized_var(ret);
+	int ret = 0;
 
 	fio_ro_check(td, io_u);
 
diff --git a/eta.c b/eta.c
index 600b046..bcf0676 100644
--- a/eta.c
+++ b/eta.c
@@ -78,6 +78,7 @@ static void check_str_update(struct thread_data *td)
 		c = 'C';
 		break;
 	case TD_INITIALIZED:
+	case TD_SETTING_UP:
 		c = 'I';
 		break;
 	case TD_NOT_CREATED:
@@ -318,7 +319,9 @@ int calc_thread_status(struct jobs_eta *je, int force)
 		} else if (td->runstate == TD_RAMP) {
 			je->nr_running++;
 			je->nr_ramp++;
-		} else if (td->runstate < TD_RUNNING)
+		} else if (td->runstate == TD_SETTING_UP)
+			je->nr_running++;
+		else if (td->runstate < TD_RUNNING)
 			je->nr_pending++;
 
 		if (je->elapsed_sec >= 3)
diff --git a/examples/cpuio b/examples/cpuio
new file mode 100644
index 0000000..577e072
--- /dev/null
+++ b/examples/cpuio
@@ -0,0 +1,8 @@
+[global]
+ioengine=cpuio
+time_based
+runtime=10
+
+[burn50percent]
+cpuload=50
+
diff --git a/examples/numa b/examples/numa
new file mode 100644
index 0000000..b81964f
--- /dev/null
+++ b/examples/numa
@@ -0,0 +1,21 @@
+; setup numa policy for each thread
+; 'numactl --show' to determine the maximum numa nodes
+[global]
+ioengine=libaio
+buffered=0
+rw=randread
+bs=512K
+iodepth=16
+size=512m
+filename=/dev/sdb1
+
+; Fix memory blocks (512K * 16) in numa node 0
+[job1]
+numa_cpu_nodes=0
+numa_mem_policy=bind:0
+
+; Interleave memory blocks (512K * 16) in numa node 0 and 1
+[job2]
+numa_cpu_nodes=0-1
+numa_mem_policy=interleave:0-1
+
diff --git a/examples/zipf b/examples/zipf
new file mode 100644
index 0000000..fcfa38d
--- /dev/null
+++ b/examples/zipf
@@ -0,0 +1,10 @@
+# Example job file for using a zipf distribution instead
+# of a purely random workload where each block is read
+# or written once.
+[job]
+ioengine=null
+rw=randread
+norandommap
+size=1280m
+bs=4k
+random_distribution=zipf:0.5
diff --git a/file.h b/file.h
index 42fd58c..38e9d0d 100644
--- a/file.h
+++ b/file.h
@@ -5,6 +5,7 @@
 #include "compiler/compiler.h"
 #include "io_ddir.h"
 #include "flist.h"
+#include "lib/zipf.h"
 
 /*
  * The type of object we are working on
@@ -112,6 +113,11 @@ struct fio_file {
 	unsigned long last_free_lookup;
 	unsigned failed_rands;
 
+	/*
+	 * Used for zipf random distribution
+	 */
+	struct zipf_state zipf;
+
 	int references;
 	enum fio_file_flags flags;
 
diff --git a/filesetup.c b/filesetup.c
index 79e29da..4a2383f 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -12,6 +12,7 @@
 #include "smalloc.h"
 #include "filehash.h"
 #include "os/os.h"
+#include "hash.h"
 
 #ifdef FIO_HAVE_LINUX_FALLOCATE
 #include <linux/falloc.h>
@@ -862,12 +863,50 @@ int pre_read_files(struct thread_data *td)
 	return 1;
 }
 
+static int __init_rand_distribution(struct thread_data *td, struct fio_file *f)
+{
+	unsigned int range_size, seed;
+	unsigned long nranges;
+
+	range_size = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]);
+
+	nranges = (f->real_file_size + range_size - 1) / range_size;
+
+	seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+		zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
+	else
+		pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+
+	return 1;
+}
+
+static int init_rand_distribution(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+	int state;
+
+	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
+		return 0;
+
+	state = td->runstate;
+	td_set_runstate(td, TD_SETTING_UP);
+	for_each_file(td, f, i)
+		__init_rand_distribution(td, f);
+	td_set_runstate(td, state);
+
+	return 1;
+}
+
 int init_random_map(struct thread_data *td)
 {
 	unsigned long long blocks, num_maps;
 	struct fio_file *f;
 	unsigned int i;
 
+	if (init_rand_distribution(td))
+		return 0;
 	if (td->o.norandommap || !td_random(td))
 		return 0;
 
diff --git a/fio.1 b/fio.1
index 13abb94..8d3fedf 100644
--- a/fio.1
+++ b/fio.1
@@ -639,6 +639,10 @@ is used for read vs write seperation.
 Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of
 milliseconds.  Default: 1000ms.
 .TP
+.BI max_latency \fR=\fPint
+If set, fio will exit the job if it exceeds this maximum latency. It will exit
+with an ETIME error.
+.TP
 .BI cpumask \fR=\fPint
 Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job
 may run on.  See \fBsched_setaffinity\fR\|(2).
@@ -646,6 +650,28 @@ may run on.  See \fBsched_setaffinity\fR\|(2).
 .BI cpus_allowed \fR=\fPstr
 Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
 .TP
+.BI numa_cpu_nodes \fR=\fPstr
+Set this job running on spcified NUMA nodes' CPUs. The arguments allow
+comma delimited list of cpu numbers, A-B ranges, or 'all'.
+.TP
+.BI numa_mem_policy \fR=\fPstr
+Set this job's memory policy and corresponding NUMA nodes. Format of
+the argements:
+.RS
+.TP
+.B <mode>[:<nodelist>]
+.TP
+.B mode
+is one of the following memory policy:
+.TP
+.B default, prefer, bind, interleave, local
+.TP
+.RE
+For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR is
+needed to be specified. For \fBprefer\fR, only one node is
+allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows
+comma delimited list of numbers, A-B ranges, or 'all'.
+.TP
 .BI startdelay \fR=\fPint
 Delay start of job for the specified number of seconds.
 .TP
diff --git a/fio.h b/fio.h
index 7f11861..5022cdf 100644
--- a/fio.h
+++ b/fio.h
@@ -50,6 +50,16 @@ struct thread_data;
 #include <sys/asynch.h>
 #endif
 
+#ifdef FIO_HAVE_LIBNUMA
+#include <linux/mempolicy.h>
+#include <numa.h>
+
+/*
+ * "local" is pseudo-policy
+ */
+#define MPOL_LOCAL MPOL_MAX
+#endif
+
 /*
  * offset generator types
  */
@@ -310,6 +320,7 @@ enum {
 
 extern int exitall_on_terminate;
 extern unsigned int thread_number;
+extern unsigned int stat_number;
 extern int shm_id;
 extern int groupid;
 extern int output_format;
@@ -344,7 +355,7 @@ static inline void fio_ro_check(struct thread_data *td, struct io_u *io_u)
 
 #define REAL_MAX_JOBS		2048
 
-static inline enum error_type td_error_type(enum fio_ddir ddir, int err)
+static inline enum error_type_bit td_error_type(enum fio_ddir ddir, int err)
 {
 	if (err == EILSEQ)
 		return ERROR_TYPE_VERIFY_BIT;
@@ -441,6 +452,7 @@ enum {
 	TD_NOT_CREATED = 0,
 	TD_CREATED,
 	TD_INITIALIZED,
+	TD_SETTING_UP,
 	TD_RAMP,
 	TD_RUNNING,
 	TD_PRE_READING,
@@ -578,4 +590,10 @@ enum {
 	FIO_OUTPUT_NORMAL,
 };
 
+enum {
+	FIO_RAND_DIST_RANDOM	= 0,
+	FIO_RAND_DIST_ZIPF,
+	FIO_RAND_DIST_PARETO,
+};
+
 #endif
diff --git a/gettime-thread.c b/gettime-thread.c
new file mode 100644
index 0000000..da40904
--- /dev/null
+++ b/gettime-thread.c
@@ -0,0 +1,78 @@
+#include <unistd.h>
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include "fio.h"
+#include "smalloc.h"
+
+struct timeval *fio_tv;
+int fio_gtod_offload = 0;
+int fio_gtod_cpu = -1;
+static pthread_t gtod_thread;
+
+void fio_gtod_init(void)
+{
+	fio_tv = smalloc(sizeof(struct timeval));
+	assert(fio_tv);
+}
+
+static void fio_gtod_update(void)
+{
+	gettimeofday(fio_tv, NULL);
+}
+
+static void *gtod_thread_main(void *data)
+{
+	struct fio_mutex *mutex = data;
+
+	fio_mutex_up(mutex);
+
+	/*
+	 * As long as we have jobs around, update the clock. It would be nice
+	 * to have some way of NOT hammering that CPU with gettimeofday(),
+	 * but I'm not sure what to use outside of a simple CPU nop to relax
+	 * it - we don't want to lose precision.
+	 */
+	while (threads) {
+		fio_gtod_update();
+		nop;
+	}
+
+	return NULL;
+}
+
+int fio_start_gtod_thread(void)
+{
+	struct fio_mutex *mutex;
+	pthread_attr_t attr;
+	int ret;
+
+	mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
+	if (!mutex)
+		return 1;
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
+	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, NULL);
+	pthread_attr_destroy(&attr);
+	if (ret) {
+		log_err("Can't create gtod thread: %s\n", strerror(ret));
+		goto err;
+	}
+
+	ret = pthread_detach(gtod_thread);
+	if (ret) {
+		log_err("Can't detatch gtod thread: %s\n", strerror(ret));
+		goto err;
+	}
+
+	dprint(FD_MUTEX, "wait on startup_mutex\n");
+	fio_mutex_down(mutex);
+	dprint(FD_MUTEX, "done waiting on startup_mutex\n");
+err:
+	fio_mutex_remove(mutex);
+	return ret;
+}
+
+
diff --git a/gettime.c b/gettime.c
index 5b49287..35d685e 100644
--- a/gettime.c
+++ b/gettime.c
@@ -19,11 +19,6 @@ static unsigned long last_cycles;
 static struct timeval last_tv;
 static int last_tv_valid;
 
-static struct timeval *fio_tv;
-int fio_gtod_offload = 0;
-int fio_gtod_cpu = -1;
-static pthread_t gtod_thread;
-
 enum fio_cs fio_clock_source = FIO_PREFERRED_CLOCK_SOURCE;
 
 #ifdef FIO_DEBUG_TIME
@@ -267,66 +262,68 @@ void fio_clock_init(void)
 	calibrate_cpu_clock();
 }
 
-void fio_gtod_init(void)
+unsigned long long utime_since(struct timeval *s, struct timeval *e)
 {
-	fio_tv = smalloc(sizeof(struct timeval));
-	assert(fio_tv);
+	long sec, usec;
+	unsigned long long ret;
+
+	sec = e->tv_sec - s->tv_sec;
+	usec = e->tv_usec - s->tv_usec;
+	if (sec > 0 && usec < 0) {
+		sec--;
+		usec += 1000000;
+	}
+
+	/*
+	 * time warp bug on some kernels?
+	 */
+	if (sec < 0 || (sec == 0 && usec < 0))
+		return 0;
+
+	ret = sec * 1000000ULL + usec;
+
+	return ret;
 }
 
-static void fio_gtod_update(void)
+unsigned long long utime_since_now(struct timeval *s)
 {
-	gettimeofday(fio_tv, NULL);
+	struct timeval t;
+
+	fio_gettime(&t, NULL);
+	return utime_since(s, &t);
 }
 
-static void *gtod_thread_main(void *data)
+unsigned long mtime_since(struct timeval *s, struct timeval *e)
 {
-	struct fio_mutex *mutex = data;
+	long sec, usec, ret;
 
-	fio_mutex_up(mutex);
-
-	/*
-	 * As long as we have jobs around, update the clock. It would be nice
-	 * to have some way of NOT hammering that CPU with gettimeofday(),
-	 * but I'm not sure what to use outside of a simple CPU nop to relax
-	 * it - we don't want to lose precision.
-	 */
-	while (threads) {
-		fio_gtod_update();
-		nop;
+	sec = e->tv_sec - s->tv_sec;
+	usec = e->tv_usec - s->tv_usec;
+	if (sec > 0 && usec < 0) {
+		sec--;
+		usec += 1000000;
 	}
 
-	return NULL;
+	if (sec < 0 || (sec == 0 && usec < 0))
+		return 0;
+
+	sec *= 1000UL;
+	usec /= 1000UL;
+	ret = sec + usec;
+
+	return ret;
 }
 
-int fio_start_gtod_thread(void)
+unsigned long mtime_since_now(struct timeval *s)
 {
-	struct fio_mutex *mutex;
-	pthread_attr_t attr;
-	int ret;
-
-	mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
-	if (!mutex)
-		return 1;
-
-	pthread_attr_init(&attr);
-	pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
-	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, NULL);
-	pthread_attr_destroy(&attr);
-	if (ret) {
-		log_err("Can't create gtod thread: %s\n", strerror(ret));
-		goto err;
-	}
+	struct timeval t;
+	void *p = __builtin_return_address(0);
 
-	ret = pthread_detach(gtod_thread);
-	if (ret) {
-		log_err("Can't detatch gtod thread: %s\n", strerror(ret));
-		goto err;
-	}
+	fio_gettime(&t, p);
+	return mtime_since(s, &t);
+}
 
-	dprint(FD_MUTEX, "wait on startup_mutex\n");
-	fio_mutex_down(mutex);
-	dprint(FD_MUTEX, "done waiting on startup_mutex\n");
-err:
-	fio_mutex_remove(mutex);
-	return ret;
+unsigned long time_since_now(struct timeval *s)
+{
+	return mtime_since_now(s) / 1000;
 }
diff --git a/gettime.h b/gettime.h
index 87cc895..309ef21 100644
--- a/gettime.h
+++ b/gettime.h
@@ -15,4 +15,6 @@ extern void fio_gtod_init(void);
 extern void fio_clock_init(void);
 extern int fio_start_gtod_thread(void);
 
+extern struct timeval *fio_tv;
+
 #endif
diff --git a/hash.h b/hash.h
index 4b8c6bf..13600f4 100644
--- a/hash.h
+++ b/hash.h
@@ -28,7 +28,9 @@
 #error Define GOLDEN_RATIO_PRIME for your wordsize.
 #endif
 
-static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+#define GR_PRIME_64	0x9e37fffffffc0001UL
+
+static inline unsigned long __hash_long(unsigned long val)
 {
 	unsigned long hash = val;
 
@@ -52,8 +54,18 @@ static inline unsigned long hash_long(unsigned long val, unsigned int bits)
 	hash *= GOLDEN_RATIO_PRIME;
 #endif
 
+	return hash;
+}
+
+static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+{
 	/* High bits are more random, so use them. */
-	return hash >> (BITS_PER_LONG - bits);
+	return __hash_long(val) >> (BITS_PER_LONG - bits);
+}
+
+static inline uint64_t __hash_u64(uint64_t val)
+{
+	return val * GR_PRIME_64;
 }
 	
 static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
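
The zipf/pareto generators added below run every generated rank through __hash_u64() so the hottest ranks are scattered across the block space instead of clustering at the start (per the "mix blocks with hashing" commits). A standalone sketch of that mapping, duplicating the one-line hash locally so it compiles outside the fio tree; the range count of 1000 is arbitrary:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* local copy of the __hash_u64() added to hash.h above */
#define GR_PRIME_64	0x9e37fffffffc0001ULL

static uint64_t __hash_u64(uint64_t val)
{
	return val * GR_PRIME_64;
}

int main(void)
{
	uint64_t nranges = 1000;	/* arbitrary number of block-sized ranges */
	uint64_t rank;

	/*
	 * Consecutive ranks are remapped by the multiplicative hash;
	 * zipf_next() additionally adds a per-state random offset so that
	 * rank 0 does not always pin block 0.
	 */
	for (rank = 0; rank < 10; rank++)
		printf("rank %" PRIu64 " -> range %" PRIu64 "\n",
		       rank, __hash_u64(rank) % nranges);

	return 0;
}
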
diff --git a/init.c b/init.c
index 488101b..bdee8a2 100644
--- a/init.c
+++ b/init.c
@@ -324,6 +324,10 @@ static struct thread_data *get_new_job(int global, struct thread_data *parent,
 	profile_add_hooks(td);
 
 	td->thread_number = thread_number;
+
+	if (!parent || !parent->o.group_reporting)
+		stat_number++;
+
 	return td;
 }
 
diff --git a/io_u.c b/io_u.c
index 347e115..dcb56f1 100644
--- a/io_u.c
+++ b/io_u.c
@@ -157,8 +157,8 @@ static int get_next_free_block(struct thread_data *td, struct fio_file *f,
 	return 1;
 }
 
-static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
-				enum fio_ddir ddir, unsigned long long *b)
+static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
+				  enum fio_ddir ddir, unsigned long long *b)
 {
 	unsigned long long rmax, r, lastb;
 	int loops = 5;
@@ -234,6 +234,36 @@ ret:
 	return 0;
 }
 
+static int __get_next_rand_offset_zipf(struct thread_data *td,
+				       struct fio_file *f, enum fio_ddir ddir,
+				       unsigned long long *b)
+{
+	*b = zipf_next(&f->zipf);
+	return 0;
+}
+
+static int __get_next_rand_offset_pareto(struct thread_data *td,
+					 struct fio_file *f, enum fio_ddir ddir,
+					 unsigned long long *b)
+{
+	*b = pareto_next(&f->zipf);
+	return 0;
+}
+
+static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
+				enum fio_ddir ddir, unsigned long long *b)
+{
+	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
+		return __get_next_rand_offset(td, f, ddir, b);
+	else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+		return __get_next_rand_offset_zipf(td, f, ddir, b);
+	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
+		return __get_next_rand_offset_pareto(td, f, ddir, b);
+
+	log_err("fio: unknown random distribution: %d\n", td->o.random_distribution);
+	return 1;
+}
+
 static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
 			       enum fio_ddir ddir, unsigned long long *b)
 {
@@ -383,7 +413,7 @@ static inline int io_u_fits(struct thread_data *td, struct io_u *io_u,
 static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
 {
 	const int ddir = io_u->ddir;
-	unsigned int uninitialized_var(buflen);
+	unsigned int buflen = 0;
 	unsigned int minbs, maxbs;
 	unsigned long r, rand_max;
 
@@ -1315,7 +1345,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 				  struct io_completion_data *icd,
 				  const enum fio_ddir idx, unsigned int bytes)
 {
-	unsigned long uninitialized_var(lusec);
+	unsigned long lusec = 0;
 
 	if (!td->o.disable_clat || !td->o.disable_bw)
 		lusec = utime_since(&io_u->issue_time, &icd->time);
@@ -1325,6 +1355,13 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 
 		tusec = utime_since(&io_u->start_time, &icd->time);
 		add_lat_sample(td, idx, tusec, bytes);
+
+		if (td->o.max_latency && tusec > td->o.max_latency) {
+			if (!td->error)
+				log_err("fio: latency of %lu usec exceeds specified max (%u usec)\n", tusec, td->o.max_latency);
+			td_verror(td, ETIMEDOUT, "max latency exceeded");
+			icd->error = ETIMEDOUT;
+		}
 	}
 
 	if (!td->o.disable_clat) {
@@ -1351,11 +1388,6 @@ static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
 static void io_completed(struct thread_data *td, struct io_u *io_u,
 			 struct io_completion_data *icd)
 {
-	/*
-	 * Older gcc's are too dumb to realize that usec is always used
-	 * initialized, silence that warning.
-	 */
-	unsigned long uninitialized_var(usec);
 	struct fio_file *f;
 
 	dprint_io_u(io_u, "io complete");
diff --git a/json.c b/json.c
index 8efbbda..cdc3b21 100644
--- a/json.c
+++ b/json.c
@@ -57,13 +57,42 @@ static struct json_value *json_create_value_float(float number)
 	return value;
 }
 
+static char *strdup_escape(const char *str)
+{
+	const char *input = str;
+	char *p, *ret;
+	int escapes;
+
+	if (!strlen(str))
+		return NULL;
+
+	escapes = 0;
+	while ((input = strpbrk(input, "\\\"")) != NULL) {
+		escapes++;
+		input++;
+	}
+
+	p = ret = malloc(strlen(str) + escapes + 1);
+	while (*str) {
+		if (*str == '\\' || *str == '\"')
+			*p++ = '\\';
+		*p++ = *str++;
+	}
+	*p = '\0';
+
+	return ret;
+}
+
+/*
+ * Valid JSON strings must escape '"' and '/' with a preceeding '/'
+ */
 static struct json_value *json_create_value_string(const char *str)
 {
 	struct json_value *value = malloc(sizeof(struct json_value));
 
 	if (value) {
 		value->type = JSON_TYPE_STRING;
-		value->string = strdup(str);
+		value->string = strdup_escape(str);
 		if (!value->string) {
 			free(value);
 			value = NULL;
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 7cff649..883bc72 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -300,3 +300,34 @@ struct rb_node *rb_first(struct rb_root *root)
 		n = n->rb_left;
 	return n;
 }
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (RB_EMPTY_NODE(node))
+		return NULL;
+
+	/*
+	 * If we have a right-hand child, go down and then left as far
+	 * as we can.
+	 */
+	if (node->rb_right) {
+		node = node->rb_right; 
+		while (node->rb_left)
+			node=node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * No right-hand children. Everything down and left is smaller than us,
+	 * so any 'next' node must be in the general direction of our parent.
+	 * Go up the tree; any time the ancestor is a right-hand child of its
+	 * parent, keep going up. First time it's a left-hand child of its
+	 * parent, said parent is our 'next' node.
+	 */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
diff --git a/lib/rbtree.h b/lib/rbtree.h
index 7563725..c6cfe4a 100644
--- a/lib/rbtree.h
+++ b/lib/rbtree.h
@@ -141,6 +141,7 @@ extern void rb_erase(struct rb_node *, struct rb_root *);
 
 /* Find logical next and previous nodes in a tree */
 extern struct rb_node *rb_first(struct rb_root *);
+extern struct rb_node *rb_next(const struct rb_node *);
 
 static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 				struct rb_node ** rb_link)
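
rb_next() gives an in-order walk when paired with the existing rb_first(). A minimal sketch of that traversal, assuming the rest of fio's lib/rbtree.h (RB_ROOT, rb_link_node(), rb_insert_color()) behaves like the Linux rbtree it is derived from; the item struct and keys are made up for illustration:

#include <stdio.h>
#include "lib/rbtree.h"

struct item {
	struct rb_node node;	/* first member, so a cast recovers the item */
	int key;
};

static void insert(struct rb_root *root, struct item *it)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct item *cur = (struct item *) *p;

		parent = *p;
		if (it->key < cur->key)
			p = &(*p)->rb_left;
		else
			p = &(*p)->rb_right;
	}

	rb_link_node(&it->node, parent, p);
	rb_insert_color(&it->node, root);
}

int main(void)
{
	struct rb_root root = RB_ROOT;
	struct item items[] = { { .key = 42 }, { .key = 7 }, { .key = 19 } };
	struct rb_node *n;
	unsigned int i;

	for (i = 0; i < 3; i++)
		insert(&root, &items[i]);

	/* rb_first() + rb_next() yields the keys in sorted order: 7 19 42 */
	for (n = rb_first(&root); n; n = rb_next(n))
		printf("%d\n", ((struct item *) n)->key);

	return 0;
}
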
diff --git a/lib/zipf.c b/lib/zipf.c
new file mode 100644
index 0000000..9b6ce63
--- /dev/null
+++ b/lib/zipf.c
@@ -0,0 +1,88 @@
+#include <math.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include "ieee754.h"
+#include "../log.h"
+#include "zipf.h"
+#include "../minmax.h"
+#include "../hash.h"
+
+#define ZIPF_MAX_GEN	10000000
+
+static void zipf_update(struct zipf_state *zs)
+{
+	unsigned long to_gen;
+	unsigned int i;
+
+	/*
+	 * It can become very costly to generate long sequences. Just cap it at
+	 * 10M max, that should be doable in 1-2s on even slow machines.
+	 * Precision will take a slight hit, but nothing major.
+	 */
+	to_gen = min(zs->nranges, ZIPF_MAX_GEN);
+
+	for (i = 0; i < to_gen; i++)
+		zs->zetan += pow(1.0 / (double) (i + 1), zs->theta);
+}
+
+static void shared_rand_init(struct zipf_state *zs, unsigned long nranges,
+			     unsigned int seed)
+{
+	memset(zs, 0, sizeof(*zs));
+	zs->nranges = nranges;
+
+	init_rand_seed(&zs->rand, seed);
+	zs->rand_off = __rand(&zs->rand);
+}
+
+void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta,
+	       unsigned int seed)
+{
+	shared_rand_init(zs, nranges, seed);
+
+	zs->theta = theta;
+	zs->zeta2 = pow(1.0, zs->theta) + pow(0.5, zs->theta);
+
+	zipf_update(zs);
+}
+
+unsigned long long zipf_next(struct zipf_state *zs)
+{
+	double alpha, eta, rand_uni, rand_z;
+	unsigned long long n = zs->nranges;
+	unsigned long long val;
+
+	alpha = 1.0 / (1.0 - zs->theta);
+	eta = (1.0 - pow(2.0 / n, 1.0 - zs->theta)) / (1.0 - zs->zeta2 / zs->zetan);
+
+	rand_uni = (double) __rand(&zs->rand) / (double) FRAND_MAX;
+	rand_z = rand_uni * zs->zetan;
+
+	if (rand_z < 1.0)
+		val = 1;
+	else if (rand_z < (1.0 + pow(0.5, zs->theta)))
+		val = 2;
+	else
+		val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha));
+
+	return (__hash_u64(val - 1) + zs->rand_off) % zs->nranges;
+}
+
+void pareto_init(struct zipf_state *zs, unsigned long nranges, double h,
+		 unsigned int seed)
+{
+	shared_rand_init(zs, nranges, seed);
+	zs->pareto_pow = log(h) / log(1.0 - h);
+}
+
+unsigned long long pareto_next(struct zipf_state *zs)
+{
+	double rand = (double) __rand(&zs->rand) / (double) FRAND_MAX;
+	unsigned long long n = zs->nranges - 1;
+
+	return (__hash_u64(n * pow(rand, zs->pareto_pow)) + zs->rand_off) % zs->nranges;
+}
diff --git a/lib/zipf.h b/lib/zipf.h
new file mode 100644
index 0000000..f98ad81
--- /dev/null
+++ b/lib/zipf.h
@@ -0,0 +1,23 @@
+#ifndef FIO_ZIPF_H
+#define FIO_ZIPF_H
+
+#include <inttypes.h>
+#include "rand.h"
+
+struct zipf_state {
+	uint64_t nranges;
+	double theta;
+	double zeta2;
+	double zetan;
+	double pareto_pow;
+	struct frand_state rand;
+	uint64_t rand_off;
+};
+
+void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, unsigned int seed);
+unsigned long long zipf_next(struct zipf_state *zs);
+
+void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, unsigned int seed);
+unsigned long long pareto_next(struct zipf_state *zs);
+
+#endif
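
The declarations above are the whole public interface; t/genzipf.c is the in-tree consumer, but a smaller sketch shows the intended call sequence. It assumes being built inside the fio tree against lib/zipf.c, lib/rand.c and lib/ieee754.c with -lm (roughly what the t/genzipf rule in the Makefile hunk links); the range count, sample count, theta of 1.2 and seed are arbitrary:

#include <stdio.h>
#include "lib/zipf.h"

#define NRANGES		1000	/* pretend the file covers 1000 block-sized ranges */
#define NSAMPLES	100000

int main(void)
{
	unsigned long hits[NRANGES] = { 0 };
	struct zipf_state zs;
	unsigned long best = 0;
	int i, best_i = 0;

	/* theta is arbitrary here; fio only rejects theta == 1.0 */
	zipf_init(&zs, NRANGES, 1.2, 0x1234);

	for (i = 0; i < NSAMPLES; i++)
		hits[zipf_next(&zs)]++;

	/* find the hottest range; the hashing means it need not be range 0 */
	for (i = 0; i < NRANGES; i++) {
		if (hits[i] > best) {
			best = hits[i];
			best_i = i;
		}
	}
	printf("hottest range %d got %lu of %d hits\n", best_i, best, NSAMPLES);

	/* pareto shares the same state type; h must satisfy 0 < h < 1 */
	pareto_init(&zs, NRANGES, 0.3, 0x1234);
	printf("first pareto block: %llu\n", pareto_next(&zs));

	return 0;
}
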
diff --git a/minmax.h b/minmax.h
new file mode 100644
index 0000000..e5c2f58
--- /dev/null
+++ b/minmax.h
@@ -0,0 +1,11 @@
+#ifndef FIO_MIN_MAX_H
+#define FIO_MIN_MAX_H
+
+#ifndef min
+#define min(a, b)	((a) < (b) ? (a) : (b))
+#endif
+#ifndef max
+#define max(a, b)	((a) > (b) ? (a) : (b))
+#endif
+
+#endif
diff --git a/options.c b/options.c
index eb7c596..9ee6060 100644
--- a/options.c
+++ b/options.c
@@ -513,6 +513,130 @@ static int str_verify_cpus_allowed_cb(void *data, const char *input)
 }
 #endif
 
+#ifdef FIO_HAVE_LIBNUMA
+static int str_numa_cpunodes_cb(void *data, char *input)
+{
+	struct thread_data *td = data;
+
+	/* numa_parse_nodestring() parses a character string list
+	 * of nodes into a bit mask. The bit mask is allocated by
+	 * numa_allocate_nodemask(), so it should be freed by
+	 * numa_free_nodemask().
+	 */
+	td->o.numa_cpunodesmask = numa_parse_nodestring(input);
+	if (td->o.numa_cpunodesmask == NULL) {
+		log_err("fio: numa_parse_nodestring failed\n");
+		td_verror(td, 1, "str_numa_cpunodes_cb");
+		return 1;
+	}
+
+	td->o.numa_cpumask_set = 1;
+	return 0;
+}
+
+static int str_numa_mpol_cb(void *data, char *input)
+{
+	struct thread_data *td = data;
+	const char * const policy_types[] =
+		{ "default", "prefer", "bind", "interleave", "local" };
+	int i;
+
+	char *nodelist = strchr(input, ':');
+	if (nodelist) {
+		/* NUL-terminate mode */
+		*nodelist++ = '\0';
+	}
+
+	for (i = 0; i <= MPOL_LOCAL; i++) {
+		if (!strcmp(input, policy_types[i])) {
+			td->o.numa_mem_mode = i;
+			break;
+		}
+	}
+	if (i > MPOL_LOCAL) {
+		log_err("fio: memory policy should be: default, prefer, bind, interleave, local\n");
+		goto out;
+	}
+
+	switch (td->o.numa_mem_mode) {
+	case MPOL_PREFERRED:
+		/*
+		 * Insist on a nodelist of one node only
+		 */
+		if (nodelist) {
+			char *rest = nodelist;
+			while (isdigit(*rest))
+				rest++;
+			if (*rest) {
+				log_err("fio: one node only for \'prefer\'\n");
+				goto out;
+			}
+		} else {
+			log_err("fio: one node is needed for \'prefer\'\n");
+			goto out;
+		}
+		break;
+	case MPOL_INTERLEAVE:
+		/*
+		 * Default to online nodes with memory if no nodelist
+		 */
+		if (!nodelist)
+			nodelist = strdup("all");
+		break;
+	case MPOL_LOCAL:
+	case MPOL_DEFAULT:
+		/*
+		 * Don't allow a nodelist
+		 */
+		if (nodelist) {
+			log_err("fio: NO nodelist for \'local\'\n");
+			goto out;
+		}
+		break;
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist) {
+			log_err("fio: a nodelist is needed for \'bind\'\n");
+			goto out;
+		}
+		break;
+	}
+
+
+	/* numa_parse_nodestring() parses a character string list
+	 * of nodes into a bit mask. The bit mask is allocated by
+	 * numa_allocate_nodemask(), so it should be freed by
+	 * numa_free_nodemask().
+	 */
+	switch (td->o.numa_mem_mode) {
+	case MPOL_PREFERRED:
+		td->o.numa_mem_prefer_node = atoi(nodelist);
+		break;
+	case MPOL_INTERLEAVE:
+	case MPOL_BIND:
+		td->o.numa_memnodesmask = numa_parse_nodestring(nodelist);
+		if (td->o.numa_memnodesmask == NULL) {
+			log_err("fio: numa_parse_nodestring failed\n");
+			td_verror(td, 1, "str_numa_memnodes_cb");
+			return 1;
+		}
+		break;
+	case MPOL_LOCAL:
+	case MPOL_DEFAULT:
+	default:
+		break;
+	}
+
+	td->o.numa_memmask_set = 1;
+	return 0;
+
+out:
+	return 1;
+}
+#endif
+
 static int str_fst_cb(void *data, const char *str)
 {
 	struct thread_data *td = data;
@@ -543,6 +667,45 @@ static int str_sfr_cb(void *data, const char *str)
 }
 #endif
 
+static int str_random_distribution_cb(void *data, const char *str)
+{
+	struct thread_data *td = data;
+	double val;
+	char *nr;
+
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+		val = 1.1;
+	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
+		val = 0.2;
+	else
+		return 0;
+
+	nr = get_opt_postfix(str);
+	if (nr && !str_to_float(nr, &val)) {
+		log_err("fio: random postfix parsing failed\n");
+		free(nr);
+		return 1;
+	}
+
+	free(nr);
+
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) {
+		if (val == 1.00) {
+			log_err("fio: zipf theta must different than 1.0\n");
+			return 1;
+		}
+		td->o.zipf_theta.u.f = val;
+	} else {
+		if (val <= 0.00 || val >= 1.00) {
+			log_err("fio: pareto input out of range (0 < input < 1.0)\n");
+			return 1;
+		}
+		td->o.pareto_h.u.f = val;
+	}
+
+	return 0;
+}
+
 /*
  * Return next file in the string. Files are separated with ':'. If the ':'
  * is escaped with a '\', then that ':' is part of the filename and does not
@@ -1452,6 +1615,28 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.group	= FIO_OPT_G_RANDOM,
 	},
 	{
+		.name	= "random_distribution",
+		.type	= FIO_OPT_STR,
+		.off1	= td_var_offset(random_distribution),
+		.cb	= str_random_distribution_cb,
+		.help	= "Random offset distribution generator",
+		.def	= "random",
+		.posval	= {
+			  { .ival = "random",
+			    .oval = FIO_RAND_DIST_RANDOM,
+			    .help = "Completely random",
+			  },
+			  { .ival = "zipf",
+			    .oval = FIO_RAND_DIST_ZIPF,
+			    .help = "Zipf distribution",
+			  },
+			  { .ival = "pareto",
+			    .oval = FIO_RAND_DIST_PARETO,
+			    .help = "Pareto distribution",
+			  },
+		},
+	},
+	{
 		.name	= "nrfiles",
 		.lname	= "Number of files",
 		.alias	= "nr_files",
@@ -2312,6 +2497,14 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.group	= FIO_OPT_G_RATE,
 	},
 	{
+		.name	= "max_latency",
+		.type	= FIO_OPT_INT,
+		.off1	= td_var_offset(max_latency),
+		.help	= "Maximum tolerated IO latency (usec)",
+		.category = FIO_OPT_C_IO,
+		.group = FIO_OPT_G_RATE,
+	},
+	{
 		.name	= "invalidate",
 		.lname	= "Cache invalidate",
 		.type	= FIO_OPT_BOOL,
@@ -2401,6 +2594,20 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.group	= FIO_OPT_G_CRED,
 	},
 #endif
+#ifdef FIO_HAVE_LIBNUMA
+	{
+		.name	= "numa_cpu_nodes",
+		.type	= FIO_OPT_STR,
+		.cb	= str_numa_cpunodes_cb,
+		.help	= "NUMA CPU nodes bind",
+	},
+	{
+		.name	= "numa_mem_policy",
+		.type	= FIO_OPT_STR,
+		.cb	= str_numa_mpol_cb,
+		.help	= "NUMA memory policy setup",
+	},
+#endif
 	{
 		.name	= "end_fsync",
 		.lname	= "End fsync",
diff --git a/parse.c b/parse.c
index 9a6494f..ffe2dc0 100644
--- a/parse.c
+++ b/parse.c
@@ -14,6 +14,7 @@
 #include "parse.h"
 #include "debug.h"
 #include "options.h"
+#include "minmax.h"
 
 static struct fio_option *__fio_options;
 
@@ -219,7 +220,7 @@ static unsigned long long get_mult_bytes(const char *str, int len, void *data,
 /*
  * Convert string into a floating number. Return 1 for success and 0 otherwise.
  */
-static int str_to_float(const char *str, double *val)
+int str_to_float(const char *str, double *val)
 {
 	return (1 == sscanf(str, "%lf", val));
 }
diff --git a/parse.h b/parse.h
index 7fee4fa..b9da7b9 100644
--- a/parse.h
+++ b/parse.h
@@ -89,6 +89,7 @@ extern void strip_blank_front(char **);
 extern void strip_blank_end(char *);
 extern int str_to_decimal(const char *, long long *, int, void *);
 extern int check_str_bytes(const char *p, long long *val, void *data);
+extern int str_to_float(const char *str, double *val);
 
 /*
  * Handlers for the options
@@ -100,13 +101,6 @@ typedef int (fio_opt_str_set_fn)(void *);
 
 #define td_var(start, offset)	((void *) start + (offset))
 
-#ifndef min
-#define min(a, b)	((a) < (b) ? (a) : (b))
-#endif
-#ifndef max
-#define max(a, b)	((a) > (b) ? (a) : (b))
-#endif
-
 static inline int parse_is_percent(unsigned long long val)
 {
 	return val <= -1ULL && val >= (-1ULL - 100ULL);
diff --git a/server.c b/server.c
index d120c52..6447591 100644
--- a/server.c
+++ b/server.c
@@ -561,12 +561,15 @@ static int handle_job_cmd(struct fio_net_cmd *cmd)
 	pdu->buf_len = le32_to_cpu(pdu->buf_len);
 	pdu->client_type = le32_to_cpu(pdu->client_type);
 
+	stat_number = 0;
+
 	if (parse_jobs_ini(buf, 1, 0, pdu->client_type)) {
 		fio_net_send_quit(server_fd);
 		return -1;
 	}
 
 	spdu.jobs = cpu_to_le32(thread_number);
+	spdu.stat_outputs = cpu_to_le32(stat_number);
 	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
 	return 0;
 }
@@ -597,6 +600,8 @@ static int handle_jobline_cmd(struct fio_net_cmd *cmd)
 		dprint(FD_NET, "server: %d: %s\n", i, argv[i]);
 	}
 
+	stat_number = 0;
+
 	if (parse_cmd_line(clp->lines, argv, clp->client_type)) {
 		fio_net_send_quit(server_fd);
 		free(argv);
@@ -606,6 +611,7 @@ static int handle_jobline_cmd(struct fio_net_cmd *cmd)
 	free(argv);
 
 	spdu.jobs = cpu_to_le32(thread_number);
+	spdu.stat_outputs = cpu_to_le32(stat_number);
 	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
 	return 0;
 }
diff --git a/server.h b/server.h
index 5273fd1..201e62d 100644
--- a/server.h
+++ b/server.h
@@ -38,7 +38,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-	FIO_SERVER_VER			= 18,
+	FIO_SERVER_VER			= 19,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 
@@ -113,6 +113,7 @@ struct cmd_job_pdu {
 
 struct cmd_start_pdu {
 	uint32_t jobs;
+	uint32_t stat_outputs;
 };
 
 struct cmd_end_pdu {
diff --git a/smalloc.c b/smalloc.c
index d0b6f1e..b017373 100644
--- a/smalloc.c
+++ b/smalloc.c
@@ -36,14 +36,14 @@ struct pool {
 	struct fio_mutex *lock;			/* protects this pool */
 	void *map;				/* map of blocks */
 	unsigned int *bitmap;			/* blocks free/busy map */
-	unsigned int free_blocks;		/* free blocks */
-	unsigned int nr_blocks;			/* total blocks */
-	unsigned int next_non_full;
-	unsigned int mmap_size;
+	size_t free_blocks;		/* free blocks */
+	size_t nr_blocks;			/* total blocks */
+	size_t next_non_full;
+	size_t mmap_size;
 };
 
 struct block_hdr {
-	unsigned int size;
+	size_t size;
 #ifdef SMALLOC_REDZONE
 	unsigned int prered;
 #endif
@@ -91,13 +91,13 @@ static inline int ptr_valid(struct pool *pool, void *ptr)
 	return (ptr >= pool->map) && (ptr < pool->map + pool_size);
 }
 
-static inline unsigned int size_to_blocks(unsigned int size)
+static inline size_t size_to_blocks(size_t size)
 {
 	return (size + SMALLOC_BPB - 1) / SMALLOC_BPB;
 }
 
 static int blocks_iter(struct pool *pool, unsigned int pool_idx,
-		       unsigned int idx, unsigned int nr_blocks,
+		       unsigned int idx, size_t nr_blocks,
 		       int (*func)(unsigned int *map, unsigned int mask))
 {
 
@@ -152,19 +152,19 @@ static int mask_set(unsigned int *map, unsigned int mask)
 }
 
 static int blocks_free(struct pool *pool, unsigned int pool_idx,
-		       unsigned int idx, unsigned int nr_blocks)
+		       unsigned int idx, size_t nr_blocks)
 {
 	return blocks_iter(pool, pool_idx, idx, nr_blocks, mask_cmp);
 }
 
 static void set_blocks(struct pool *pool, unsigned int pool_idx,
-		       unsigned int idx, unsigned int nr_blocks)
+		       unsigned int idx, size_t nr_blocks)
 {
 	blocks_iter(pool, pool_idx, idx, nr_blocks, mask_set);
 }
 
 static void clear_blocks(struct pool *pool, unsigned int pool_idx,
-			 unsigned int idx, unsigned int nr_blocks)
+			 unsigned int idx, size_t nr_blocks)
 {
 	blocks_iter(pool, pool_idx, idx, nr_blocks, mask_clear);
 }
@@ -348,9 +348,9 @@ void sfree(void *ptr)
 	sfree_pool(pool, ptr);
 }
 
-static void *__smalloc_pool(struct pool *pool, unsigned int size)
+static void *__smalloc_pool(struct pool *pool, size_t size)
 {
-	unsigned int nr_blocks;
+	size_t nr_blocks;
 	unsigned int i;
 	unsigned int offset;
 	unsigned int last_idx;
@@ -403,9 +403,9 @@ fail:
 	return ret;
 }
 
-static void *smalloc_pool(struct pool *pool, unsigned int size)
+static void *smalloc_pool(struct pool *pool, size_t size)
 {
-	unsigned int alloc_size = size + sizeof(struct block_hdr);
+	size_t alloc_size = size + sizeof(struct block_hdr);
 	void *ptr;
 
 	/*
@@ -431,10 +431,13 @@ static void *smalloc_pool(struct pool *pool, unsigned int size)
 	return ptr;
 }
 
-void *smalloc(unsigned int size)
+void *smalloc(size_t size)
 {
 	unsigned int i;
 
+	if (size != (unsigned int) size)
+		return NULL;
+
 	global_write_lock();
 	i = last_pool;
 
diff --git a/smalloc.h b/smalloc.h
index 6905c6a..f9a5e41 100644
--- a/smalloc.h
+++ b/smalloc.h
@@ -1,7 +1,7 @@
 #ifndef FIO_SMALLOC_H
 #define FIO_SMALLOC_H
 
-extern void *smalloc(unsigned int);
+extern void *smalloc(size_t);
 extern void sfree(void *);
 extern char *smalloc_strdup(const char *);
 extern void sinit(void);
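
The check added at the top of smalloc() simply refuses any request that would change when narrowed to unsigned int, rather than truncating it silently. A standalone illustration of that round-trip test (hypothetical program, not fio code):

	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	/* a size "fits" when casting to unsigned int and back leaves it unchanged */
	static int size_fits(size_t size)
	{
		return size == (unsigned int) size;
	}

	int main(void)
	{
		printf("4096 fits: %d\n", size_fits((size_t) 4096));
		printf("SIZE_MAX fits: %d\n", size_fits(SIZE_MAX));	/* 0 on 64-bit hosts */
		return 0;
	}
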
diff --git a/stat.c b/stat.c
index 4a881d4..2e78242 100644
--- a/stat.c
+++ b/stat.c
@@ -16,7 +16,11 @@ void update_rusage_stat(struct thread_data *td)
 {
 	struct thread_stat *ts = &td->ts;
 
+#ifdef RUSAGE_THREAD
+	getrusage(RUSAGE_THREAD, &td->ru_end);
+#else
 	getrusage(RUSAGE_SELF, &td->ru_end);
+#endif
 
 	ts->usr_time += mtime_since(&td->ru_start.ru_utime,
 					&td->ru_end.ru_utime);
diff --git a/t/genzipf.c b/t/genzipf.c
new file mode 100644
index 0000000..2d1b107
--- /dev/null
+++ b/t/genzipf.c
@@ -0,0 +1,287 @@
+/*
+ * Generate/analyze pareto/zipf distributions to better understand
+ * what an access pattern would look like.
+ *
+ * For instance, the following would generate a zipf distribution
+ * with theta 1.2 over the default 500GB device with 4k blocks,
+ * splitting the report into 20 buckets:
+ *
+ *	t/genzipf -t zipf -i 1.2 -o 20
+ *
+ * The distribution type (-t zipf or -t pareto) and its input value
+ * (-i) are the main knobs; any option not given falls back to a default.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../lib/zipf.h"
+#include "../flist.h"
+#include "../hash.h"
+#include "../rbtree.h"
+
+#define DEF_NR		1000000
+#define DEF_NR_OUTPUT	23
+
+struct node {
+	struct flist_head list;
+	unsigned long long val;
+	unsigned long hits;
+};
+
+static struct flist_head *hash;
+static unsigned long hash_bits = 24;
+static unsigned long hash_size = 1 << 24;
+
+enum {
+	TYPE_NONE = 0,
+	TYPE_ZIPF,
+	TYPE_PARETO,
+};
+static const char *dist_types[] = { "None", "Zipf", "Pareto" };
+
+static int dist_type = TYPE_ZIPF;
+static unsigned long gb_size = 500;
+static unsigned long block_size = 4096;
+static unsigned long output_nranges = DEF_NR_OUTPUT;
+static double percentage;
+static double dist_val;
+
+#define DEF_ZIPF_VAL	1.2
+#define DEF_PARETO_VAL	0.3
+
+static struct node *hash_lookup(unsigned long long val)
+{
+	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+	struct flist_head *entry;
+	struct node *n;
+
+	flist_for_each(entry, l) {
+		n = flist_entry(entry, struct node, list);
+		if (n->val == val)
+			return n;
+	}
+
+	return NULL;
+}
+
+static struct node *hash_insert(struct node *n, unsigned long long val)
+{
+	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+
+	n->val = val;
+	n->hits = 1;
+	flist_add_tail(&n->list, l);
+	return n;
+}
+
+static int parse_options(int argc, char *argv[])
+{
+	const char *optstring = "t:g:i:o:b:p:";
+	int c, dist_val_set = 0;
+
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch (c) {
+		case 'p':
+			percentage = atof(optarg);
+			break;
+		case 'b':
+			block_size = strtoul(optarg, NULL, 10);
+			break;
+		case 't':
+			if (!strncmp(optarg, "zipf", 4))
+				dist_type = TYPE_ZIPF;
+			else if (!strncmp(optarg, "pareto", 6))
+				dist_type = TYPE_PARETO;
+			else {
+				printf("wrong dist type: %s\n", optarg);
+				return 1;
+			}
+			break;
+		case 'g':
+			gb_size = strtoul(optarg, NULL, 10);
+			break;
+		case 'i':
+			dist_val = atof(optarg);
+			dist_val_set = 1;
+			break;
+		case 'o':
+			output_nranges = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			printf("bad option %c\n", c);
+			return 1;
+		}
+	}
+
+	if (dist_type == TYPE_PARETO) {
+		if (!dist_val_set)
+			dist_val = DEF_PARETO_VAL;
+		if (dist_val <= 0.00 || dist_val >= 1.00) {
+			printf("pareto input must be > 0.00 and < 1.00\n");
+			return 1;
+		}
+	} else if (dist_type == TYPE_ZIPF) {
+		if (!dist_val_set)
+			dist_val = DEF_ZIPF_VAL;
+		if (dist_val == 1.0) {
+			printf("zipf input must be different than 1.0\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+struct output_sum {
+	double output;
+	unsigned int nranges;
+};
+
+static int node_cmp(const void *p1, const void *p2)
+{
+	const struct node *n1 = p1;
+	const struct node *n2 = p2;
+
+	return n2->hits - n1->hits;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long offset;
+	unsigned long i, j, k, nr_vals, cur_vals, interval, total_vals, nnodes;
+	unsigned long long nranges;
+	struct output_sum *output_sums;
+	struct node *nodes;
+	double perc, perc_i;
+	struct zipf_state zs;
+
+	if (parse_options(argc, argv))
+		return 1;
+
+	printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
+
+	nranges = gb_size * 1024 * 1024 * 1024ULL;
+	nranges /= block_size;
+
+	if (dist_type == TYPE_ZIPF)
+		zipf_init(&zs, nranges, dist_val, 1);
+	else
+		pareto_init(&zs, nranges, dist_val, 1);
+
+	hash_bits = 0;
+	hash_size = nranges;
+	while ((hash_size >>= 1) != 0)
+		hash_bits++;
+
+	hash_size = 1 << hash_bits;
+
+	hash = malloc(hash_size * sizeof(struct flist_head));
+	for (i = 0; i < hash_size; i++)
+		INIT_FLIST_HEAD(&hash[i]);
+
+	nodes = malloc(nranges * sizeof(struct node));
+
+	for (nr_vals = i = j = 0; i < nranges; i++) {
+		struct node *n;
+
+		if (dist_type == TYPE_ZIPF)
+			offset = zipf_next(&zs);
+		else
+			offset = pareto_next(&zs);
+
+		n = hash_lookup(offset);
+		if (n)
+			n->hits++;
+		else {
+			hash_insert(&nodes[j], offset);
+			j++;
+		}
+
+		nr_vals++;
+	}
+
+	qsort(nodes, j, sizeof(struct node), node_cmp);
+	nnodes = j;
+	nr_vals = nnodes;
+
+	interval = (nr_vals + output_nranges - 1) / output_nranges;
+
+	output_sums = malloc(output_nranges * sizeof(struct output_sum));
+	for (i = 0; i < output_nranges; i++) {
+		output_sums[i].output = 0.0;
+		output_sums[i].nranges = 1;
+	}
+
+	total_vals = i = j = cur_vals = 0;
+
+	for (k = 0; k < nnodes; k++) {
+		struct output_sum *os = &output_sums[j];
+		struct node *node = &nodes[k];
+
+		if (i >= interval) {
+			os->output = (double) (cur_vals + 1) / (double) nranges;
+			os->output *= 100.0;
+			j++;
+			cur_vals = node->hits;
+			interval += (nr_vals + output_nranges - 1) / output_nranges;
+		} else {
+			cur_vals += node->hits;
+			os->nranges += node->hits;
+		}
+
+		i++;
+		total_vals += node->hits;
+
+		if (percentage) {
+			unsigned long blocks = percentage * nranges / 100;
+
+			if (total_vals >= blocks) {
+				double cs = i * block_size / (1024 * 1024);
+				char p = 'M';
+
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'G';
+				}
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'T';
+				}
+
+				printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
+				percentage = 0.0;
+			}
+		}
+	}
+
+	perc_i = 100.0 / (double) output_nranges;
+	perc = 0.0;
+
+	printf("\n   Rows           Hits           No Hits         Size\n");
+	printf("--------------------------------------------------------\n");
+	for (i = 0; i < j; i++) {
+		struct output_sum *os = &output_sums[i];
+		double gb = (double) os->nranges * block_size / 1024.0;
+		char p = 'K';
+
+		if (gb > 1024.0) {
+			p = 'M';
+			gb /= 1024.0;
+		}
+		if (gb > 1024.0) {
+			p = 'G';
+			gb /= 1024.0;
+		}
+
+		perc += perc_i;
+		printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", i ? "|->" : "Top", perc, os->output, os->nranges, gb, p);
+	}
+
+	free(output_sums);
+	free(hash);
+	return 0;
+}
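
A possible invocation once the tool is built (the numbers are purely illustrative): generate a zipf distribution with theta 1.2 over a 100GB device with 4k blocks, report the hits in 20 buckets, and also print how much of the device would have to be cached to absorb 90% of the hits:

	t/genzipf -t zipf -i 1.2 -g 100 -b 4096 -o 20 -p 90
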
diff --git a/t/log.c b/t/log.c
index 7f1de27..ac02303 100644
--- a/t/log.c
+++ b/t/log.c
@@ -13,3 +13,16 @@ int log_err(const char *format, ...)
 
 	return fwrite(buffer, len, 1, stderr);
 }
+
+int log_info(const char *format, ...)
+{
+	char buffer[1024];
+	va_list args;
+	size_t len;
+
+	va_start(args, format);
+	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	va_end(args);
+
+	return fwrite(buffer, len, 1, stdout);
+}
diff --git a/t/stest.c b/t/stest.c
index c179484..0da8f2c 100644
--- a/t/stest.c
+++ b/t/stest.c
@@ -6,6 +6,8 @@
 #include "../flist.h"
 
 FILE *f_err;
+struct timeval *fio_tv = NULL;
+unsigned int fio_debug = 0;
 
 #define MAGIC1	0xa9b1c8d2
 #define MAGIC2	0xf0a1e9b3
@@ -82,3 +84,7 @@ int main(int argc, char *argv[])
 	scleanup();
 	return 0;
 }
+
+void __dprint(int type, const char *str, ...)
+{
+}
diff --git a/thread_options.h b/thread_options.h
index 9b90796..9975af1 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -119,6 +119,10 @@ struct thread_options {
 	unsigned int bs_unaligned;
 	unsigned int fsync_on_close;
 
+	unsigned int random_distribution;
+	fio_fp64_t zipf_theta;
+	fio_fp64_t pareto_h;
+
 	unsigned int hugepage_size;
 	unsigned int rw_min_bs;
 	unsigned int thinktime;
@@ -141,6 +145,8 @@ struct thread_options {
 	enum fio_memtype mem_type;
 	unsigned int mem_align;
 
+	unsigned max_latency;
+
 	unsigned int stonewall;
 	unsigned int new_group;
 	unsigned int numjobs;
@@ -148,6 +154,14 @@ struct thread_options {
 	unsigned int cpumask_set;
 	os_cpu_mask_t verify_cpumask;
 	unsigned int verify_cpumask_set;
+#ifdef FIO_HAVE_LIBNUMA
+	struct bitmask *numa_cpunodesmask;
+	unsigned int numa_cpumask_set;
+	unsigned short numa_mem_mode;
+	unsigned int numa_mem_prefer_node;
+	struct bitmask *numa_memnodesmask;
+	unsigned int numa_memmask_set;
+#endif
 	unsigned int iolog;
 	unsigned int rwmixcycle;
 	unsigned int rwmix[2];
@@ -308,6 +322,10 @@ struct thread_options_pack {
 	uint32_t bs_unaligned;
 	uint32_t fsync_on_close;
 
+	uint32_t random_distribution;
+	fio_fp64_t zipf_theta;
+	fio_fp64_t pareto_h;
+
 	uint32_t hugepage_size;
 	uint32_t rw_min_bs;
 	uint32_t thinktime;
@@ -330,6 +348,8 @@ struct thread_options_pack {
 	uint32_t mem_type;
 	uint32_t mem_align;
 
+	uint32_t max_latency;
+
 	uint32_t stonewall;
 	uint32_t new_group;
 	uint32_t numjobs;
diff --git a/time.c b/time.c
index 4af84bc..c4d1d4c 100644
--- a/time.c
+++ b/time.c
@@ -6,72 +6,6 @@
 static struct timeval genesis;
 static unsigned long ns_granularity;
 
-unsigned long long utime_since(struct timeval *s, struct timeval *e)
-{
-	long sec, usec;
-	unsigned long long ret;
-
-	sec = e->tv_sec - s->tv_sec;
-	usec = e->tv_usec - s->tv_usec;
-	if (sec > 0 && usec < 0) {
-		sec--;
-		usec += 1000000;
-	}
-
-	/*
-	 * time warp bug on some kernels?
-	 */
-	if (sec < 0 || (sec == 0 && usec < 0))
-		return 0;
-
-	ret = sec * 1000000ULL + usec;
-
-	return ret;
-}
-
-unsigned long long utime_since_now(struct timeval *s)
-{
-	struct timeval t;
-
-	fio_gettime(&t, NULL);
-	return utime_since(s, &t);
-}
-
-unsigned long mtime_since(struct timeval *s, struct timeval *e)
-{
-	long sec, usec, ret;
-
-	sec = e->tv_sec - s->tv_sec;
-	usec = e->tv_usec - s->tv_usec;
-	if (sec > 0 && usec < 0) {
-		sec--;
-		usec += 1000000;
-	}
-
-	if (sec < 0 || (sec == 0 && usec < 0))
-		return 0;
-
-	sec *= 1000UL;
-	usec /= 1000UL;
-	ret = sec + usec;
-
-	return ret;
-}
-
-unsigned long mtime_since_now(struct timeval *s)
-{
-	struct timeval t;
-	void *p = __builtin_return_address(0);
-
-	fio_gettime(&t, p);
-	return mtime_since(s, &t);
-}
-
-unsigned long time_since_now(struct timeval *s)
-{
-	return mtime_since_now(s) / 1000;
-}
-
 /*
  * busy looping version for the last few usec
  */
--
To unsubscribe from this list: send the line "unsubscribe fio" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

