[PATCH 2/2] rt-tests: oslat: Init commit

oslat was initially a standalone program [1].  This patch merges oslat into
the rt-tests repo.

This is a direct port of oslat v0.1.7 into rt-tests.  The version number is
naturally bumped to the latest rt-tests version.
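
For context, the measurement core of oslat boils down to something like the
following simplified sketch (measure(), N_BUCKETS and ticks_per_us are
illustrative names only; the real code lives in frc(), doit() and
insert_bucket() below):

    #include <stdint.h>
    #include <x86intrin.h>              /* __rdtsc(), x86 only */

    #define N_BUCKETS 32                /* one bucket per microsecond of gap */

    static uint64_t buckets[N_BUCKETS];

    static void measure(uint64_t ticks_per_us, volatile int *stop)
    {
        uint64_t prev = __rdtsc(), now, us;

        while (!*stop) {
            /* an optional workload (e.g. memmove()) would run here */
            now = __rdtsc();
            us = (now - prev) / ticks_per_us;   /* gap in microseconds */
            buckets[us < N_BUCKETS ? us : N_BUCKETS - 1]++;
            prev = now;
        }
    }

Any iteration that takes noticeably longer than the loop itself lands in a
higher bucket; that extra time is the latency introduced by the OS.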

[1] https://github.com/xzpeter/oslat

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
 .gitignore        |   1 +
 Makefile          |  10 +-
 src/oslat/oslat.8 |  66 ++++
 src/oslat/oslat.c | 896 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 971 insertions(+), 2 deletions(-)
 create mode 100644 src/oslat/oslat.8
 create mode 100644 src/oslat/oslat.c

diff --git a/.gitignore b/.gitignore
index bc01575..a975c4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ SRPMS
 /queuelat
 /ssdd
 /get_cyclictest_snapshot
+/oslat
 
 rt-tests.spec
 tags
diff --git a/Makefile b/Makefile
index be78312..3f59efb 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,8 @@ sources = cyclictest.c \
 	  cyclicdeadline.c \
 	  deadline_test.c \
 	  queuelat.c \
-	  ssdd.c
+	  ssdd.c \
+	  oslat.c
 
 TARGETS = $(sources:.c=)
 LIBS	= -lrt -lpthread
@@ -48,7 +49,8 @@ MANPAGES = src/cyclictest/cyclictest.8 \
 	   src/sched_deadline/deadline_test.8 \
 	   src/ssdd/ssdd.8 \
 	   src/sched_deadline/cyclicdeadline.8 \
-	   src/cyclictest/get_cyclictest_snapshot.8
+	   src/cyclictest/get_cyclictest_snapshot.8 \
+	   src/oslat/oslat.8
 
 ifdef PYLIB
 	MANPAGES += src/hwlatdetect/hwlatdetect.8
@@ -97,6 +99,7 @@ VPATH	+= src/hackbench:
 VPATH	+= src/sched_deadline:
 VPATH	+= src/queuelat:	
 VPATH	+= src/ssdd:
+VPATH	+= src/oslat:
 
 $(OBJDIR)/%.o: %.c | $(OBJDIR)
 	$(CC) -D VERSION=$(VERSION) -c $< $(CFLAGS) $(CPPFLAGS) -o $@
@@ -164,6 +167,9 @@ queuelat: $(OBJDIR)/queuelat.o $(OBJDIR)/librttest.a
 ssdd: $(OBJDIR)/ssdd.o $(OBJDIR)/librttest.a
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LIBS) $(RTTESTLIB)
 
+oslat: $(OBJDIR)/oslat.o $(OBJDIR)/librttest.a
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LIBS) $(RTTESTLIB) $(NUMA_LIBS)
+
 %.8.gz: %.8
 	gzip -nc $< > $@
 
diff --git a/src/oslat/oslat.8 b/src/oslat/oslat.8
new file mode 100644
index 0000000..83257c6
--- /dev/null
+++ b/src/oslat/oslat.8
@@ -0,0 +1,66 @@
+.TH OSLAT 8 "August 17, 2020"
+.\" for manpage-specific macros, see man(7)
+.SH NAME
+oslat \- OS Latency Detector
+.SH SYNOPSIS
+.SY oslat
+.RI "[ \-shvz ] [ \-b " bucket-size " ] [ \-B " bias " ] [ \-c " cpu-list " ] \
+[ \-C " cpu-main-thread " ] [ \-f " rt-prio " ] [ \-m " workload-mem " ] \
+[\-t " runtime " ] [ \-T " trace-threshold " ] [ \-w " workload " ]"
+.SH DESCRIPTION
+.B oslat
+is an open-source userspace polling-mode stress program that detects OS-level
+latency.  The program runs a busy loop, optionally with a workload, and reads
+the TSC frequently to measure how long each iteration takes.
+.SH OPTIONS
+.TP
+.B \-b, \-\-bucket-size=N
+Specify the number of buckets (4-1024).
+.TP
+.B \-B, \-\-bias=USEC
+Add a bias to all the buckets, using the minimum latency estimated during
+the pre-heat stage.
+.TP
+.B \-c, \-\-cpu-list=CPULIST
+Specify CPUs to run on.  For example, '1,3,5,7-15'.
+.TP
+.B \-C, \-\-cpu-main-thread=CORE
+Specify which CPU the main thread runs on.  Default is cpu0.
+.TP
+.B \-f, \-\-rtprio=PRIORITY
+Run with the specified SCHED_FIFO priority (1-99).  Otherwise the default
+scheduling policy is used, which is normally SCHED_OTHER.
+.TP
+.B \-m, \-\-workload-mem=SIZE
+Size of the memory to use for the workload (e.g., 4K, 1M).
+Total memory usage will be this value multiplied by 2*N, because each of the
+N tested CPUs gets a src and a dst buffer.  For example, \-m 1M on 16 tested
+CPUs uses 32M in total.
+.TP
+.B \-t, \-\-runtime=SEC
+Specify the test duration, e.g., 60, 20m, 2H (m/M: minutes, h/H: hours,
+d/D: days).  The default unit is seconds (s/S).
+.TP
+.B \-T, \-\-trace-threshold=THRESHOLD
+Stop the test when a latency above THRESHOLD (in microseconds) is detected.
+At the same time, write a marker into ftrace and stop ftrace tracing.
+.TP
+.B \-w, \-\-workload=WORKLOAD
+Specify the kind of workload to run; the default is no workload.
+Options: "no", "memmove".
+.TP
+.B \-s, \-\-single-preheat
+Use a single thread when measuring latency at the pre-heat stage.
+NOTE: please make sure the CPU frequency on all testing cores is locked
+before using this parameter.  If you don't know how to lock the frequency,
+please don't use this parameter.
+.TP
+.B \-h, \-\-help
+Show the help message.
+.TP
+.B \-v, \-\-version
+Show the version of the program.
+.TP
+.B \-z, \-\-zero-omit
+Don't display buckets in the output histogram if they contain all zeros.
+.SH AUTHOR
+.B oslat
+was written by Peter Xu <peterx@xxxxxxxxxx>.
diff --git a/src/oslat/oslat.c b/src/oslat/oslat.c
new file mode 100644
index 0000000..d796919
--- /dev/null
+++ b/src/oslat/oslat.c
@@ -0,0 +1,896 @@
+/*
+ * oslat - OS latency detector
+ *
+ * Copyright 2020 Red Hat Inc.
+ *
+ * Authors: Peter Xu <peterx@xxxxxxxxxx>
+ *
+ * Some of the utility code based on sysjitter-1.3:
+ * Copyright 2010-2015 David Riddoch <david@xxxxxxxxxxxxxx>
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of version 3 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <numa.h>
+#include <math.h>
+#include <limits.h>
+#include <linux/unistd.h>
+
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/utsname.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+
+#include "rt-utils.h"
+#include "error.h"
+
+#ifdef __GNUC__
+# define atomic_inc(ptr)   __sync_add_and_fetch((ptr), 1)
+# if defined(__x86_64__)
+#  define relax()          __asm__ __volatile__("pause" ::: "memory")
+static inline void frc(uint64_t* pval)
+{
+    uint32_t low, high;
+    /* See rdtsc_ordered() of Linux */
+    __asm__ __volatile__("lfence");
+    __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high));
+    *pval = ((uint64_t) high << 32) | low;
+}
+# elif defined(__i386__)
+#  define relax()          __asm__ __volatile__("pause" ::: "memory")
+static inline void frc(uint64_t* pval)
+{
+    __asm__ __volatile__("rdtsc" : "=A" (*pval));
+}
+# elif defined(__PPC64__)
+#  define relax()          do{}while(0)
+static inline void frc(uint64_t* pval)
+{
+    __asm__ __volatile__("mfspr %0, 268\n" : "=r" (*pval));
+}
+# else
+#  error Need frc() for this platform.
+# endif
+#else
+# error Need to add support for this compiler.
+#endif
+
+typedef uint64_t stamp_t;   /* timestamp */
+typedef uint64_t cycles_t;  /* number of cycles */
+typedef unsigned char bool;
+
+#define  true   1
+#define  false  0
+
+enum command {
+    WAIT,
+    GO,
+    STOP
+};
+
+enum workload_type {
+    WORKLOAD_NONE = 0,
+    WORKLOAD_MEMMOVE,
+    WORKLOAD_NUM,
+};
+
+/* This workload needs pre-allocated memory */
+#define  WORK_NEED_MEM  (1UL << 0)
+
+typedef void (*workload_fn)(char *dst, char *src, size_t size);
+
+struct workload {
+    const char *w_name;
+    uint64_t w_flags;
+    workload_fn w_fn;
+};
+
+/* We'll have buckets 1us, 2us, ..., (BUCKET_SIZE) us. */
+#define  BUCKET_SIZE  (32)
+
+/* Default size of the workloads per thread (in bytes, which is 16KB) */
+#define  WORKLOAD_MEM_SIZE  (16UL << 10)
+
+/* By default, no workload */
+#define  WORKLOAD_DEFAULT  WORKLOAD_NONE
+
+struct thread {
+    int                  core_i;
+    pthread_t            thread_id;
+
+    /* NOTE! this is also how many ticks per us */
+    unsigned             cpu_mhz;
+    cycles_t             int_total;
+    stamp_t              frc_start;
+    stamp_t              frc_stop;
+    cycles_t             runtime;
+    stamp_t             *buckets;
+    uint64_t             minlat;
+    /* Maximum latency detected */
+    uint64_t             maxlat;
+    /*
+     * The extra part of the interruptions that cannot be put into even the
+     * biggest bucket.  We'll use this to calculate a more accurate average at
+     * the end of the tests.
+     */
+    uint64_t             overflow_sum;
+    int                  memory_allocated;
+
+    /* Buffers used for the workloads */
+    char *               src_buf;
+    char *               dst_buf;
+
+    /* These variables are calculated after the test */
+    double               average;
+};
+
+struct global {
+    /* Configuration. */
+    unsigned              runtime_secs;
+    /* Number of threads running in the current test (pre-heat or the real run) */
+    unsigned              n_threads;
+    /* Number of threads to test for the real run */
+    unsigned              n_threads_total;
+    struct timeval        tv_start;
+    int                   rtprio;
+    int                   bucket_size;
+    int                   trace_threshold;
+    int                   runtime;
+    /* The core that the main thread runs on.  Default is cpu0 */
+    int                   cpu_main_thread;
+    char *                cpu_list;
+    char *                app_name;
+    struct workload *     workload;
+    uint64_t              workload_mem_size;
+    int                   enable_bias;
+    uint64_t              bias;
+    bool                  single_preheat_thread;
+    bool                  output_omit_zero_buckets;
+
+    /* Mutable state. */
+    volatile enum command cmd;
+    volatile unsigned     n_threads_started;
+    volatile unsigned     n_threads_ready;
+    volatile unsigned     n_threads_running;
+    volatile unsigned     n_threads_finished;
+};
+
+static struct global g;
+
+static void workload_nop(char *dst, char *src, size_t size)
+{
+    /* Nop */
+}
+
+static void workload_memmove(char *dst, char *src, size_t size)
+{
+    memmove(dst, src, size);
+}
+
+struct workload workload_list[WORKLOAD_NUM] = {
+    { "no", 0, workload_nop },
+    { "memmove", WORK_NEED_MEM, workload_memmove },
+};
+
+#define TEST(x)                                 \
+    do {                                        \
+        if( ! (x) )                             \
+            test_fail(#x, __LINE__);            \
+    } while( 0 )
+
+#define TEST0(x)  TEST((x) == 0)
+
+static void test_fail(const char* what, int line)
+{
+    fprintf(stderr, "ERROR:\n");
+    fprintf(stderr, "ERROR: TEST(%s)\n", what);
+    fprintf(stderr, "ERROR: at line %d\n", line);
+    fprintf(stderr, "ERROR: errno=%d (%s)\n", errno, strerror(errno));
+    fprintf(stderr, "ERROR:\n");
+    exit(1);
+}
+
+static int move_to_core(int core_i)
+{
+    cpu_set_t cpus;
+    CPU_ZERO(&cpus);
+    CPU_SET(core_i, &cpus);
+    return sched_setaffinity(0, sizeof(cpus), &cpus);
+}
+
+static cycles_t __measure_cpu_hz(void)
+{
+    struct timeval tvs, tve;
+    stamp_t s, e;
+    double sec;
+
+    frc(&s);
+    e = s;
+    gettimeofday(&tvs, NULL);
+    while( e - s < 1000000 )
+        frc(&e);
+    gettimeofday(&tve, NULL);
+    sec = tve.tv_sec - tvs.tv_sec + (tve.tv_usec - tvs.tv_usec) / 1e6;
+    return (cycles_t) ((e - s) / sec);
+}
+
+static unsigned measure_cpu_mhz(void)
+{
+    cycles_t m, mprev, d;
+
+    mprev = __measure_cpu_hz();
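+    /* Retry until two consecutive estimates agree within 0.1% */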
+    do {
+        m = __measure_cpu_hz();
+        if( m > mprev )  d = m - mprev;
+        else             d = mprev - m;
+        mprev = m;
+    } while( d > m / 1000 );
+
+    return (unsigned) (m / 1000000);
+}
+
+static void thread_init(struct thread* t)
+{
+    t->cpu_mhz = measure_cpu_mhz();
+    t->maxlat = 0;
+    t->overflow_sum = 0;
+    t->minlat = (uint64_t)-1;
+
+    /* NOTE: all the buffers are not freed until the process quits. */
+    if (!t->memory_allocated) {
+        TEST(t->buckets = calloc(1, sizeof(t->buckets[0]) * g.bucket_size));
+        if (g.workload->w_flags & WORK_NEED_MEM) {
+            TEST0(posix_memalign((void **)&t->src_buf, getpagesize(),
+                                 g.workload_mem_size));
+            memset(t->src_buf, 0, g.workload_mem_size);
+            TEST0(posix_memalign((void **)&t->dst_buf, getpagesize(),
+                                 g.workload_mem_size));
+            memset(t->dst_buf, 0, g.workload_mem_size);
+        }
+        t->memory_allocated = 1;
+    } else {
+        /* Clear the buckets */
+        memset(t->buckets, 0, sizeof(t->buckets[0]) * g.bucket_size);
+    }
+}
+
+static float cycles_to_sec(const struct thread* t, uint64_t cycles)
+{
+    return cycles / (t->cpu_mhz * 1e6);
+}
+
+static void insert_bucket(struct thread *t, stamp_t value)
+{
+    int index, us;
+    uint64_t extra;
+
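+    /*
+     * t->cpu_mhz is also the number of TSC ticks per microsecond, so this
+     * converts the measured cycle delta into whole microseconds.
+     */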
+    index = value / t->cpu_mhz;
+    assert(index >= 0);
+    us = index + 1;
+    assert(us > 0);
+
+    if (g.trace_threshold && us >= g.trace_threshold) {
+        char *line = "%s: Trace threshold (%d us) triggered with %u us!  "
+            "Stopping the test.\n";
+        tracemark(line, g.app_name, g.trace_threshold, us);
+        err_quit(line, g.app_name, g.trace_threshold, us);
+    }
+
+    /* Update max latency */
+    if (us > t->maxlat) {
+        t->maxlat = us;
+    }
+
+    if (us < t->minlat) {
+        t->minlat = us;
+    }
+
+    if (g.bias) {
+        /* g.bias is set after the pre-heat stage if the user enabled it */
+        us -= g.bias;
+        /*
+         * A negative value should hardly happen, but if it does, assume we
+         * are in the smallest bucket, which is 1us.  The same applies to
+         * index.
+         */
+        if (us <= 0) {
+            us = 1;
+        }
+        index -= g.bias;
+        if (index < 0) {
+            index = 0;
+        }
+    }
+
+    /* The jitter is too big; put it into the last bucket */
+    if (index >= g.bucket_size) {
+        /* Keep the extra bit (in us) */
+        extra = index - g.bucket_size;
+        if (t->overflow_sum + extra < t->overflow_sum) {
+            /* The uint64_t even overflowed itself; bail out */
+            printf("Accumulated overflow too much!\n");
+            exit(1);
+        }
+        t->overflow_sum += extra;
+        index = g.bucket_size - 1;
+    }
+
+    t->buckets[index]++;
+    if (t->buckets[index] == 0) {
+        printf("Bucket %d overflowed\n", index);
+        exit(1);
+    }
+}
+
+static void doit(struct thread* t)
+{
+    stamp_t ts1, ts2;
+    workload_fn workload_fn = g.workload->w_fn;
+
+    frc(&ts2);
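+    /*
+     * Busy loop: measure the TSC gap between consecutive iterations; any
+     * gap noticeably larger than the cost of the loop plus the workload is
+     * latency introduced by the OS (e.g. interrupts or preemption).
+     */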
+    do {
+        workload_fn(t->dst_buf, t->src_buf, g.workload_mem_size);
+        frc(&ts1);
+        insert_bucket(t, ts1 - ts2);
+        ts2 = ts1;
+    } while (g.cmd == GO);
+}
+
+static int set_fifo_prio(int prio)
+{
+    struct sched_param param;
+
+    memset(&param, 0, sizeof(param));
+    param.sched_priority = prio;
+    return sched_setscheduler(0, SCHED_FIFO, &param);
+}
+
+static void* thread_main(void* arg)
+{
+    /* Important thing to note here is that once we start bashing the CPU, we
+     * need to keep doing so to prevent the core from changing frequency or
+     * dropping into a low power state.
+     */
+    struct thread* t = arg;
+
+    /* Alloc memory in the thread itself after setting affinity to get the
+     * best chance of getting numa-local memory.  Doesn't matter so much for
+     * the "struct thread" since we expect that to stay cache resident.
+     */
+    TEST(move_to_core(t->core_i) == 0);
+    if (g.rtprio)
+        TEST(set_fifo_prio(g.rtprio) == 0);
+
+    /* Don't bash the cpu until all threads have got going. */
+    atomic_inc(&g.n_threads_started);
+    while( g.cmd == WAIT )
+        usleep(1000);
+
+    thread_init(t);
+
+    /* Ensure we all start at the same time. */
+    atomic_inc(&g.n_threads_running);
+    while( g.n_threads_running != g.n_threads )
+        relax();
+
+    frc(&t->frc_start);
+    doit(t);
+    frc(&t->frc_stop);
+
+    t->runtime = t->frc_stop - t->frc_start;
+
+    /* Wait for everyone to finish so we don't disturb them by exiting and
+     * waking the main thread.
+     */
+    atomic_inc(&g.n_threads_finished);
+    while( g.n_threads_finished != g.n_threads )
+        relax();
+
+    return NULL;
+}
+
+#define putfield(label, val, fmt, end) do {     \
+        printf("%12s:\t", label);               \
+        for (i = 0; i < g.n_threads; ++i)       \
+            printf(" %"fmt, val);               \
+        printf("%s\n", end);                    \
+    } while (0)
+
+void calculate(struct thread *t)
+{
+    int i, j;
+    double sum;
+    uint64_t count;
+
+    for (i = 0; i < g.n_threads; ++i) {
+        /* Calculate average */
+        sum = count = 0;
+        for (j = 0; j < g.bucket_size; j++) {
+            sum += 1.0 * t[i].buckets[j] * (g.bias+j+1);
+            count += t[i].buckets[j];
+        }
+        /* Add the extra amount of huge spikes in */
+        sum += t[i].overflow_sum;
+        t[i].average = sum / count;
+    }
+}
+
+static void write_summary(struct thread* t)
+{
+    int i, j, k, print_dotdotdot = 0;
+    char bucket_name[64];
+
+    calculate(t);
+
+    putfield("Core", t[i].core_i, "d", "");
+    putfield("CPU Freq", t[i].cpu_mhz, "u", " (Mhz)");
+
+    for (j = 0; j < g.bucket_size; j++) {
+        if (j < g.bucket_size-1 && g.output_omit_zero_buckets) {
+            for (k = 0; k < g.n_threads; k++) {
+                if (t[k].buckets[j] != 0)
+                    break;
+            }
+            if (k == g.n_threads) {
+                print_dotdotdot = 1;
+                continue;
+            }
+        }
+
+        if (print_dotdotdot) {
+            printf("    ...\n");
+            print_dotdotdot = 0;
+        }
+
+        snprintf(bucket_name, sizeof(bucket_name), "%03"PRIu64
+                 " (us)", g.bias+j+1);
+        putfield(bucket_name, t[i].buckets[j], PRIu64,
+                 (j==g.bucket_size-1) ? " (including overflows)" : "");
+    }
+
+    putfield("Minimum", t[i].minlat, PRIu64, " (us)");
+    putfield("Average", t[i].average, ".3lf", " (us)");
+    putfield("Maximum", t[i].maxlat, PRIu64, " (us)");
+    putfield("Max-Min", t[i].maxlat - t[i].minlat, PRIu64, " (us)");
+    putfield("Duration", cycles_to_sec(&(t[i]), t[i].runtime),
+             ".3f", " (sec)");
+    printf("\n");
+}
+
+static void run_expt(struct thread* threads, int runtime_secs)
+{
+    int i;
+
+    g.runtime_secs = runtime_secs;
+    g.n_threads_started = 0;
+    g.n_threads_ready = 0;
+    g.n_threads_running = 0;
+    g.n_threads_finished = 0;
+    g.cmd = WAIT;
+
+    for( i = 0; i < g.n_threads; ++i ) {
+        TEST0(pthread_create(&(threads[i].thread_id), NULL,
+                             thread_main, &(threads[i])));
+    }
+    while( g.n_threads_started != g.n_threads ) {
+        usleep(1000);
+    }
+
+    gettimeofday(&g.tv_start, NULL);
+    g.cmd = GO;
+
+    alarm(runtime_secs);
+
+    /* Go to sleep until the threads have done their stuff. */
+    for( i = 0; i < g.n_threads; ++i ) {
+        pthread_join(threads[i].thread_id, NULL);
+    }
+}
+
+static void handle_alarm(int code)
+{
+    g.cmd = STOP;
+}
+
+const char *helpmsg =
+    "Usage: %s [options]\n"
+    "\n"
+    "This is an OS latency detector by running busy loops on specified cores.\n"
+    "Please run this tool using root.\n"
+    "\n"
+    "Available options:\n"
+    "\n"
+    "  -b, --bucket-size      Specify the number of the buckets (4-1024)\n"
+    "  -B, --bias             Add a bias to all the buckets using the estimated mininum\n"
+    "  -c, --cpu-list         Specify CPUs to run on, e.g. '1,3,5,7-15'\n"
+    "  -C, --cpu-main-thread  Specify which CPU the main thread runs on.  Default is cpu0.\n"
+    "  -f, --rtprio           Using SCHED_FIFO priority (1-99)\n"
+    "  -m, --workload-mem     Size of the memory to use for the workload (e.g., 4K, 1M).\n"
+    "                         Total memory usage will be this value multiplies 2*N,\n"
+    "                         because there will be src/dst buffers for each thread, and\n"
+    "                         N is the number of processors for testing.\n"
+    "  -s, --single-preheat   Use a single thread when measuring latency at preheat stage\n"
+    "                         NOTE: please make sure the CPU frequency on all testing cores\n"
+    "                         are locked before using this parmater.  If you don't know how\n"
+    "                         to lock the freq then please don't use this parameter.\n"
+    "  -t, --runtime          Specify test duration, e.g., 60, 20m, 2H\n"
+    "                         (m/M: minutes, h/H: hours, d/D: days)\n"
+    "  -T, --trace-threshold  Stop the test when threshold triggered (in us),\n"
+    "                         print a marker in ftrace and stop ftrace too.\n"
+    "  -v, --version          Display the version of the software.\n"
+    "  -w, --workload         Specify a kind of workload, default is no workload\n"
+    "                         (options: no, memmove)\n"
+    "  -z, --zero-omit        Don't display buckets in the output histogram if all zeros.\n"
+    "\n"
+    ;
+
+static void usage(void)
+{
+    printf(helpmsg, g.app_name);
+    exit(1);
+}
+
+/* TODO: use libnuma? */
+static int parse_cpu_list(char *cpu_list, cpu_set_t *cpu_set)
+{
+    struct bitmask *cpu_mask;
+    int i, n_cores;
+
+    n_cores = sysconf(_SC_NPROCESSORS_CONF);
+
+    if (!cpu_list) {
+        for (i = 0; i < n_cores; i++)
+            CPU_SET(i, cpu_set);
+        return n_cores;
+    }
+
+    cpu_mask = numa_parse_cpustring_all(cpu_list);
+    if (cpu_mask) {
+        for (i = 0; i < n_cores; i++) {
+            if (numa_bitmask_isbitset(cpu_mask, i)) {
+                CPU_SET(i, cpu_set);
+            }
+        }
+        numa_bitmask_free(cpu_mask);
+    } else {
+        warn("Unknown cpu-list: %s, using all available cpus\n", cpu_list);
+        for (i = 0; i < n_cores; i++)
+            CPU_SET(i, cpu_set);
+    }
+
+    return n_cores;
+}
+
+static int parse_runtime(const char *str)
+{
+    char *endptr;
+    int v = strtol(str, &endptr, 10);
+
+    if (!*endptr) {
+        return v;
+    }
+
+    switch (*endptr) {
+    case 'd':
+    case 'D':
+        /* Days */
+        v *= 24;
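+        /* fall through */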
+    case 'h':
+    case 'H':
+        /* Hours */
+        v *= 60;
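+        /* fall through */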
+    case 'm':
+    case 'M':
+        /* Minutes */
+        v *= 60;
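+        /* fall through */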
+    case 's':
+    case 'S':
+        /* Seconds */
+        break;
+    default:
+        printf("Unknown runtime suffix: %s\n", endptr);
+        v = 0;
+        break;
+    }
+
+    return v;
+}
+
+static int parse_mem_size(char *str, uint64_t *val)
+{
+    char *endptr;
+    int v = strtol(str, &endptr, 10);
+
+    if (!*endptr) {
+        return v;
+    }
+
+    switch (*endptr) {
+    case 'g':
+    case 'G':
+        v *= 1024;
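+        /* fall through */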
+    case 'm':
+    case 'M':
+        v *= 1024;
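+        /* fall through */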
+    case 'k':
+    case 'K':
+        v *= 1024;
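+        /* fall through */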
+    case 'b':
+    case 'B':
+        break;
+    default:
+        return -1;
+    }
+
+    *val = v;
+
+    return 0;
+}
+
+static int workload_select(char *name)
+{
+    int i = 0;
+
+    for (i = 0; i < WORKLOAD_NUM; i++) {
+        if (!strcmp(name, workload_list[i].w_name)) {
+            g.workload = &workload_list[i];
+            return 0;
+        }
+    }
+
+    return -1;
+}
+
+/* Process commandline options */
+static void parse_options(int argc, char *argv[])
+{
+    while (1) {
+        static struct option options[] = {
+            { "bucket-size", required_argument, NULL, 'b' },
+            { "cpu-list", required_argument, NULL, 'c' },
+            { "cpu-main-thread", required_argument, NULL, 'C'},
+            { "runtime", required_argument, NULL, 't' },
+            { "rtprio", required_argument, NULL, 'f' },
+            { "help", no_argument, NULL, 'h' },
+            { "trace-threshold", required_argument, NULL, 'T' },
+            { "workload", required_argument, NULL, 'w'},
+            { "workload-mem", required_argument, NULL, 'm'},
+            { "bias", no_argument, NULL, 'B'},
+            { "single-preheat", no_argument, NULL, 's'},
+            { "zero-omit", no_argument, NULL, 'u'},
+            { "version", no_argument, NULL, 'v'},
+            { NULL, 0, NULL, 0 },
+        };
+        int i, c = getopt_long(argc, argv, "b:Bc:C:f:hm:st:w:T:vz",
+                               options, NULL);
+        long ncores;
+
+        if (c == -1)
+            break;
+
+        switch (c) {
+        case 'b':
+            g.bucket_size = strtol(optarg, NULL, 10);
+            if (g.bucket_size > 1024 || g.bucket_size < 4) {
+                printf("Illegal bucket size: %s (should be: 4-1024)\n",
+                       optarg);
+                exit(1);
+            }
+            break;
+        case 'B':
+            g.enable_bias = 1;
+            break;
+        case 'c':
+            g.cpu_list = strdup(optarg);
+            break;
+        case 'C':
+            ncores = sysconf(_SC_NPROCESSORS_CONF);
+            g.cpu_main_thread = strtol(optarg, NULL, 10);
+            if (g.cpu_main_thread < 0 || g.cpu_main_thread >= ncores) {
+                printf("Illegal core for main thread: %s (should be: 0-%ld)\n",
+                       optarg, ncores - 1);
+                exit(1);
+            }
+            break;
+        case 't':
+            g.runtime = parse_runtime(optarg);
+            if (!g.runtime) {
+                printf("Illegal runtime: %s\n", optarg);
+                exit(1);
+            }
+            break;
+        case 'f':
+            g.rtprio = strtol(optarg, NULL, 10);
+            if (g.rtprio < 1 || g.rtprio > 99) {
+                printf("Illegal RT priority: %s (should be: 1-99)\n", optarg);
+                exit(1);
+            }
+            break;
+        case 'T':
+            g.trace_threshold = strtol(optarg, NULL, 10);
+            if (g.trace_threshold <= 0) {
+                printf("Parameter --trace-threshold needs to be positive\n");
+                exit(1);
+            }
+            enable_trace_mark();
+            break;
+        case 'w':
+            if (workload_select(optarg)) {
+                printf("Unknown workload '%s'.  Please choose from: ", optarg);
+                for (i = 0; i < WORKLOAD_NUM; i++) {
+                    printf("'%s'", workload_list[i].w_name);
+                    if (i != WORKLOAD_NUM - 1) {
+                        printf(", ");
+                    }
+                }
+                printf("\n\n");
+                exit(1);
+            }
+            break;
+        case 'm':
+            if (parse_mem_size(optarg, &g.workload_mem_size)) {
+                printf("Unknown workload memory size '%s'.\n\n", optarg);
+                exit(1);
+            }
+            break;
+        case 's':
+            /*
+             * Only use one core for pre-heat.  Then if --bias is used, the
+             * bias will be exactly the min value of the pre-heat core.
+             */
+            g.single_preheat_thread = true;
+            break;
+        case 'v':
+            /*
+             * The version is always printed before option parsing, so all
+             * we need to do here is quit.
+             */
+            exit(0);
+            break;
+        case 'z':
+            g.output_omit_zero_buckets = 1;
+            break;
+        default:
+            usage();
+            break;
+        }
+    }
+}
+
+void dump_globals(void)
+{
+    printf("Total runtime: \t\t%d seconds\n", g.runtime);
+    printf("Thread priority: \t");
+    if (g.rtprio) {
+        printf("SCHED_FIFO:%d\n", g.rtprio);
+    } else {
+        printf("default\n");
+    }
+    printf("CPU list: \t\t%s\n", g.cpu_list ?: "(all cores)");
+    printf("CPU for main thread: \t%d\n", g.cpu_main_thread);
+    printf("Workload: \t\t%s\n", g.workload->w_name);
+    printf("Workload mem: \t\t%"PRIu64" (KiB)\n",
+           (g.workload->w_flags & WORK_NEED_MEM) ?
+           (g.workload_mem_size / 1024) : 0);
+    printf("Preheat cores: \t\t%d\n", g.single_preheat_thread ?
+           1 : g.n_threads_total);
+    printf("\n");
+}
+
+static void record_bias(struct thread *t)
+{
+    int i;
+    uint64_t bias = (uint64_t)-1;
+
+    if (!g.enable_bias) {
+        return;
+    }
+
+    /* Record the min value of minlat on all the threads */
+    for( i = 0; i < g.n_threads; ++i ) {
+        if (t[i].minlat < bias) {
+            bias = t[i].minlat;
+        }
+    }
+    g.bias = bias;
+    printf("Global bias set to %" PRId64 " (us)\n", bias);
+}
+
+int main(int argc, char* argv[])
+{
+    struct thread* threads;
+    int i, n_cores;
+    cpu_set_t cpu_set;
+
+    CPU_ZERO(&cpu_set);
+
+    g.app_name = argv[0];
+    g.rtprio = 0;
+    g.bucket_size = BUCKET_SIZE;
+    g.runtime = 1;
+    g.workload = &workload_list[WORKLOAD_DEFAULT];
+    g.workload_mem_size = WORKLOAD_MEM_SIZE;
+    /* Run the main thread on cpu0 by default */
+    g.cpu_main_thread = 0;
+
+    printf("\nVersion: %1.2f\n\n", VERSION);
+
+    parse_options(argc, argv);
+
+    TEST(mlockall(MCL_CURRENT | MCL_FUTURE) == 0);
+
+    n_cores = parse_cpu_list(g.cpu_list, &cpu_set);
+
+    TEST( threads = calloc(1, CPU_COUNT(&cpu_set) * sizeof(threads[0])) );
+    for( i = 0; i < n_cores; ++i )
+        if (CPU_ISSET(i, &cpu_set) && move_to_core(i) == 0)
+            threads[g.n_threads_total++].core_i = i;
+
+    if (CPU_ISSET(0, &cpu_set) && g.rtprio) {
+        printf("WARNING: Running SCHED_FIFO workload on CPU 0 "
+               "may hang the main thread\n");
+    }
+
+    TEST(move_to_core(g.cpu_main_thread) == 0);
+
+    signal(SIGALRM, handle_alarm);
+    signal(SIGINT, handle_alarm);
+    signal(SIGTERM, handle_alarm);
+
+    dump_globals();
+
+    printf("Pre-heat for 1 seconds...\n");
+    if (g.single_preheat_thread) {
+        g.n_threads = 1;
+    } else {
+        g.n_threads = g.n_threads_total;
+    }
+    run_expt(threads, 1);
+    record_bias(threads);
+
+    printf("Test starts...\n");
+    /* Reset n_threads to always run on all the cores */
+    g.n_threads = g.n_threads_total;
+    run_expt(threads, g.runtime);
+
+    printf("Test completed.\n\n");
+
+    write_summary(threads);
+
+    if (g.cpu_list) {
+        free(g.cpu_list);
+        g.cpu_list = NULL;
+    }
+
+    return 0;
+}
-- 
2.26.2



