On Mon, 17 Aug 2020, Peter Xu wrote: > oslat was initially a standalone program [1]. This patch merges oslat into > rt-tests repo. > > This is a direct port of oslat v0.1.7 into rt-tests. It naturally bumps the > version to latest rt-tests version. > > [1] https://github.com/xzpeter/oslat > > Signed-off-by: Peter Xu <peterx@xxxxxxxxxx> > --- > .gitignore | 1 + > Makefile | 10 +- > src/oslat/oslat.8 | 66 ++++ > src/oslat/oslat.c | 896 ++++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 971 insertions(+), 2 deletions(-) > create mode 100644 src/oslat/oslat.8 > create mode 100644 src/oslat/oslat.c > > diff --git a/.gitignore b/.gitignore > index bc01575..a975c4b 100644 > --- a/.gitignore > +++ b/.gitignore > @@ -39,6 +39,7 @@ SRPMS > /queuelat > /ssdd > /get_cyclictest_snapshot > +/oslat > > rt-tests.spec > tags > diff --git a/Makefile b/Makefile > index be78312..3f59efb 100644 > --- a/Makefile > +++ b/Makefile > @@ -17,7 +17,8 @@ sources = cyclictest.c \ > cyclicdeadline.c \ > deadline_test.c \ > queuelat.c \ > - ssdd.c > + ssdd.c \ > + oslat.c > > TARGETS = $(sources:.c=) > LIBS = -lrt -lpthread > @@ -48,7 +49,8 @@ MANPAGES = src/cyclictest/cyclictest.8 \ > src/sched_deadline/deadline_test.8 \ > src/ssdd/ssdd.8 \ > src/sched_deadline/cyclicdeadline.8 \ > - src/cyclictest/get_cyclictest_snapshot.8 > + src/cyclictest/get_cyclictest_snapshot.8 \ > + src/oslat/oslat.8 > > ifdef PYLIB > MANPAGES += src/hwlatdetect/hwlatdetect.8 > @@ -97,6 +99,7 @@ VPATH += src/hackbench: > VPATH += src/sched_deadline: > VPATH += src/queuelat: > VPATH += src/ssdd: > +VPATH += src/oslat: > > $(OBJDIR)/%.o: %.c | $(OBJDIR) > $(CC) -D VERSION=$(VERSION) -c $< $(CFLAGS) $(CPPFLAGS) -o $@ > @@ -164,6 +167,9 @@ queuelat: $(OBJDIR)/queuelat.o $(OBJDIR)/librttest.a > ssdd: $(OBJDIR)/ssdd.o $(OBJDIR)/librttest.a > $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LIBS) $(RTTESTLIB) > > +oslat: $(OBJDIR)/oslat.o $(OBJDIR)/librttest.a > + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< $(LIBS) $(RTTESTLIB) $(NUMA_LIBS) > + > %.8.gz: %.8 > gzip -nc $< > $@ > > diff --git a/src/oslat/oslat.8 b/src/oslat/oslat.8 > new file mode 100644 > index 0000000..83257c6 > --- /dev/null > +++ b/src/oslat/oslat.8 > @@ -0,0 +1,66 @@ > +.TH OSLAT 8 "August 17, 2020" > +.\" for manpage-specific macros, see man(7) > +.SH NAME > +oslat \- OS Latency Detector > +.SH SYNOPSIS > +.SY oslat > +.RI "[ \-shvz ] [ \-b " bucket-size " ] [ \-B " bias " ] [ \-c " cpu-list " ] \ > +[ \-C " cpu-main-thread " ] [ \-f " rt-prio " ] [ \-m " workload-mem " ] \ > +[\-t " runtime " ] [ \-T " trace-threshold " ] [ \-w " workload " ]" > +.SH DESCRIPTION > +.B oslat > +is an open source userspace polling mode stress program to detect OS level > +latency. The program runs a busy loop with no or various workload, collecting > +TSC information and measure the time frequently during the process. > +.SH OPTIONS > +.TP > +.B \-b, \-\-bucket-size=N > +Specify the number of the buckets (4-1024). > +.TP > +.B \-B, \-\-bias=USEC > +Add a bias to all the buckets using the estimated mininum. > +.TP > +.B \-c, \-\-cpu-list=CPULIST > +Specify CPUs to run on. For example, '1,3,5,7-15'. > +.TP > +.B \-C, \-\-cpu-main-thread=CORE > +Specify which CPU the main thread runs on. Default is cpu0. > +.TP > +.B \-f, \-\-rtprio=PRIORITY > +Using specific SCHED_FIFO priority (1-99). Otherwise use the default > +priority, normally it will be SCHED_OTHER. > +.TP > +.B \-m, \-\-workload-mem=SIZE > +Size of the memory to use for the workload (e.g., 4K, 1M). 
> +Total memory usage will be this value multiplied by 2*N,
> +because there will be src/dst buffers for each thread, and
> +N is the number of processors for testing.
> +.TP
> +.B \-t, \-\-runtime=SEC
> +Specify test duration, e.g., 60, 20m, 2H (m/M: minutes, h/H: hours, d/D: days).
> +By default the unit is seconds (s/S).
> +.TP
> +.B \-T, \-\-trace-threshold=THRESHOLD
> +Stop the test when the threshold (in USEC) is triggered. At the same time,
> +print a marker in ftrace and stop ftrace as well.
> +.TP
> +.B \-w, \-\-workload=WORKLOAD
> +Specify the kind of workload; the default is no workload. Options: "no", "memmove".
> +.TP
> +.B \-s, \-\-single-preheat
> +Use a single thread when measuring latency at the pre-heat stage.
> +NOTE: please make sure the CPU frequency on all testing cores
> +is locked before using this parameter. If you don't know how
> +to lock the frequency, then please don't use this parameter.
> +.TP
> +.B \-h, \-\-help
> +Show the help message.
> +.TP
> +.B \-v, \-\-version
> +Show the version of the program.
> +.TP
> +.B \-z, \-\-zero-omit
> +Don't display buckets in the output histogram if they are all zero.
> +.SH AUTHOR
> +.B oslat
> +was written by Peter Xu <peterx@xxxxxxxxxx>.
> diff --git a/src/oslat/oslat.c b/src/oslat/oslat.c
> new file mode 100644
> index 0000000..d796919
> --- /dev/null
> +++ b/src/oslat/oslat.c
> @@ -0,0 +1,896 @@
> +/*
> + * oslat - OS latency detector
> + *
> + * Copyright 2020 Red Hat Inc.
> + *
> + * Authors: Peter Xu <peterx@xxxxxxxxxx>
> + *
> + * Some of the utility code based on sysjitter-1.3:
> + * Copyright 2010-2015 David Riddoch <david@xxxxxxxxxxxxxx>
> + *
> + * This program is free software: you can redistribute it and/or modify it
> + * under the terms of version 3 of the GNU General Public License as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program. If not, see <http://www.gnu.org/licenses/>.
> + */ > + > +#include <assert.h> > +#include <inttypes.h> > +#include <ctype.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <stdint.h> > +#include <stdarg.h> > +#include <unistd.h> > +#include <fcntl.h> > +#include <getopt.h> > +#include <pthread.h> > +#include <signal.h> > +#include <sched.h> > +#include <string.h> > +#include <time.h> > +#include <errno.h> > +#include <numa.h> > +#include <math.h> > +#include <limits.h> > +#include <linux/unistd.h> > + > +#include <sys/prctl.h> > +#include <sys/stat.h> > +#include <sys/sysinfo.h> > +#include <sys/types.h> > +#include <sys/time.h> > +#include <sys/resource.h> > +#include <sys/utsname.h> > +#include <sys/mman.h> > +#include <sys/syscall.h> > + > +#include "rt-utils.h" > +#include "error.h" > + > +#ifdef __GNUC__ > +# define atomic_inc(ptr) __sync_add_and_fetch((ptr), 1) > +# if defined(__x86_64__) > +# define relax() __asm__ __volatile__("pause" ::: "memory") > +static inline void frc(uint64_t* pval) > +{ > + uint32_t low, high; > + /* See rdtsc_ordered() of Linux */ > + __asm__ __volatile__("lfence"); > + __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high)); > + *pval = ((uint64_t) high << 32) | low; > +} > +# elif defined(__i386__) > +# define relax() __asm__ __volatile__("pause" ::: "memory") > +static inline void frc(uint64_t* pval) > +{ > + __asm__ __volatile__("rdtsc" : "=A" (*pval)); > +} > +# elif defined(__PPC64__) > +# define relax() do{}while(0) > +static inline void frc(uint64_t* pval) > +{ > + __asm__ __volatile__("mfspr %0, 268\n" : "=r" (*pval)); > +} > +# else > +# error Need frc() for this platform. > +# endif > +#else > +# error Need to add support for this compiler. > +#endif > + > +typedef uint64_t stamp_t; /* timestamp */ > +typedef uint64_t cycles_t; /* number of cycles */ > +typedef unsigned char bool; > + > +#define true 1 > +#define false 0 > + > +enum command { > + WAIT, > + GO, > + STOP > +}; > + > +enum workload_type { > + WORKLOAD_NONE = 0, > + WORKLOAD_MEMMOVE, > + WORKLOAD_NUM, > +}; > + > +/* This workload needs pre-allocated memory */ > +#define WORK_NEED_MEM (1UL << 0) > + > +typedef void (*workload_fn)(char *src, char *dst, size_t size); > + > +struct workload { > + const char *w_name; > + uint64_t w_flags; > + workload_fn w_fn; > +}; > + > +/* We'll have buckets 1us, 2us, ..., (BUCKET_SIZE) us. */ > +#define BUCKET_SIZE (32) > + > +/* Default size of the workloads per thread (in bytes, which is 16KB) */ > +#define WORKLOAD_MEM_SIZE (16UL << 10) > + > +/* By default, no workload */ > +#define WORKLOAD_DEFUALT WORKLOAD_NONE > + > +struct thread { > + int core_i; > + pthread_t thread_id; > + > + /* NOTE! this is also how many ticks per us */ > + unsigned cpu_mhz; > + cycles_t int_total; > + stamp_t frc_start; > + stamp_t frc_stop; > + cycles_t runtime; > + stamp_t *buckets; > + uint64_t minlat; > + /* Maximum latency detected */ > + uint64_t maxlat; > + /* > + * The extra part of the interruptions that cannot be put into even the > + * biggest bucket. We'll use this to calculate a more accurate average at > + * the end of the tests. > + */ > + uint64_t overflow_sum; > + int memory_allocated; > + > + /* Buffers used for the workloads */ > + char * src_buf; > + char * dst_buf; > + > + /* These variables are calculated after the test */ > + double average; > +}; > + > +struct global { > + /* Configuration. 
*/ > + unsigned runtime_secs; > + /* Number of threads running for current test (either pre heat or real run) */ > + unsigned n_threads; > + /* Number of threads to test for the real run */ > + unsigned n_threads_total; > + struct timeval tv_start; > + int rtprio; > + int bucket_size; > + int trace_threshold; > + int runtime; > + /* The core that we run the main thread. Default is cpu0 */ > + int cpu_main_thread; > + char * cpu_list; > + char * app_name; > + struct workload * workload; > + uint64_t workload_mem_size; > + int enable_bias; > + uint64_t bias; > + bool single_preheat_thread; > + bool output_omit_zero_buckets; > + > + /* Mutable state. */ > + volatile enum command cmd; > + volatile unsigned n_threads_started; > + volatile unsigned n_threads_ready; > + volatile unsigned n_threads_running; > + volatile unsigned n_threads_finished; > +}; > + > +static struct global g; > + > +static void workload_nop(char *dst, char *src, size_t size) > +{ > + /* Nop */ > +} > + > +static void workload_memmove(char *dst, char *src, size_t size) > +{ > + memmove(dst, src, size); > +} > + > +struct workload workload_list[WORKLOAD_NUM] = { > + { "no", 0, workload_nop }, > + { "memmove", WORK_NEED_MEM, workload_memmove }, > +}; > + > +#define TEST(x) \ > + do { \ > + if( ! (x) ) \ > + test_fail(#x, __LINE__); \ > + } while( 0 ) > + > +#define TEST0(x) TEST((x) == 0) > + > +static void test_fail(const char* what, int line) > +{ > + fprintf(stderr, "ERROR:\n"); > + fprintf(stderr, "ERROR: TEST(%s)\n", what); > + fprintf(stderr, "ERROR: at line %d\n", line); > + fprintf(stderr, "ERROR: errno=%d (%s)\n", errno, strerror(errno)); > + fprintf(stderr, "ERROR:\n"); > + exit(1); > +} > + > +static int move_to_core(int core_i) > +{ > + cpu_set_t cpus; > + CPU_ZERO(&cpus); > + CPU_SET(core_i, &cpus); > + return sched_setaffinity(0, sizeof(cpus), &cpus); > +} > + > +static cycles_t __measure_cpu_hz(void) > +{ > + struct timeval tvs, tve; > + stamp_t s, e; > + double sec; > + > + frc(&s); > + e = s; > + gettimeofday(&tvs, NULL); > + while( e - s < 1000000 ) > + frc(&e); > + gettimeofday(&tve, NULL); > + sec = tve.tv_sec - tvs.tv_sec + (tve.tv_usec - tvs.tv_usec) / 1e6; > + return (cycles_t) ((e - s) / sec); > +} > + > +static unsigned measure_cpu_mhz(void) > +{ > + cycles_t m, mprev, d; > + > + mprev = __measure_cpu_hz(); > + do { > + m = __measure_cpu_hz(); > + if( m > mprev ) d = m - mprev; > + else d = mprev - m; > + mprev = m; > + } while( d > m / 1000 ); > + > + return (unsigned) (m / 1000000); > +} > + > +static void thread_init(struct thread* t) > +{ > + t->cpu_mhz = measure_cpu_mhz(); > + t->maxlat = 0; > + t->overflow_sum = 0; > + t->minlat = (uint64_t)-1; > + > + /* NOTE: all the buffers are not freed until the process quits. 
*/ > + if (!t->memory_allocated) { > + TEST(t->buckets = calloc(1, sizeof(t->buckets[0]) * g.bucket_size)); > + if (g.workload->w_flags & WORK_NEED_MEM) { > + TEST0(posix_memalign((void **)&t->src_buf, getpagesize(), > + g.workload_mem_size)); > + memset(t->src_buf, 0, g.workload_mem_size); > + TEST0(posix_memalign((void **)&t->dst_buf, getpagesize(), > + g.workload_mem_size)); > + memset(t->dst_buf, 0, g.workload_mem_size); > + } > + t->memory_allocated = 1; > + } else { > + /* Clear the buckets */ > + memset(t->buckets, 0, sizeof(t->buckets[0]) * g.bucket_size); > + } > +} > + > +static float cycles_to_sec(const struct thread* t, uint64_t cycles) > +{ > + return cycles / (t->cpu_mhz * 1e6); > +} > + > +static void insert_bucket(struct thread *t, stamp_t value) > +{ > + int index, us; > + uint64_t extra; > + > + index = value / t->cpu_mhz; > + assert(index >= 0); > + us = index + 1; > + assert(us > 0); > + > + if (g.trace_threshold && us >= g.trace_threshold) { > + char *line = "%s: Trace threshold (%d us) triggered with %u us! " > + "Stopping the test.\n"; > + tracemark(line, g.app_name, g.trace_threshold, us); > + err_quit(line, g.app_name, g.trace_threshold, us); > + } > + > + /* Update max latency */ > + if (us > t->maxlat) { > + t->maxlat = us; > + } > + > + if (us < t->minlat) { > + t->minlat = us; > + } > + > + if (g.bias) { > + /* t->bias will be set after pre-heat if user enabled it */ > + us -= g.bias; > + /* > + * Negative should hardly happen, but if it happens, we assume we're in > + * the smallest bucket, which is 1us. Same to index. > + */ > + if (us <= 0) { > + us = 1; > + } > + index -= g.bias; > + if (index < 0) { > + index = 0; > + } > + } > + > + /* Too big the jitter; put into the last bucket */ > + if (index >= g.bucket_size) { > + /* Keep the extra bit (in us) */ > + extra = index - g.bucket_size; > + if (t->overflow_sum + extra < t->overflow_sum) { > + /* The uint64_t even overflowed itself; bail out */ > + printf("Accumulated overflow too much!\n"); > + exit(1); > + } > + t->overflow_sum += extra; > + index = g.bucket_size - 1; > + } > + > + t->buckets[index]++; > + if (t->buckets[index] == 0) { > + printf("Bucket %d overflowed\n", index); > + exit(1); > + } > +} > + > +static void doit(struct thread* t) > +{ > + stamp_t ts1, ts2; > + workload_fn workload_fn = g.workload->w_fn; > + > + frc(&ts2); > + do { > + workload_fn(t->dst_buf, t->src_buf, g.workload_mem_size); > + frc(&ts1); > + insert_bucket(t, ts1 - ts2); > + ts2 = ts1; > + } while (g.cmd == GO); > +} > + > +static int set_fifo_prio(int prio) > +{ > + struct sched_param param; > + > + memset(¶m, 0, sizeof(param)); > + param.sched_priority = prio; > + return sched_setscheduler(0, SCHED_FIFO, ¶m); > +} > + > +static void* thread_main(void* arg) > +{ > + /* Important thing to note here is that once we start bashing the CPU, we > + * need to keep doing so to prevent the core from changing frequency or > + * dropping into a low power state. > + */ > + struct thread* t = arg; > + > + /* Alloc memory in the thread itself after setting affinity to get the > + * best chance of getting numa-local memory. Doesn't matter so much for > + * the "struct thread" since we expect that to stay cache resident. > + */ > + TEST(move_to_core(t->core_i) == 0); > + if (g.rtprio) > + TEST(set_fifo_prio(g.rtprio) == 0); > + > + /* Don't bash the cpu until all threads have got going. 
*/ > + atomic_inc(&g.n_threads_started); > + while( g.cmd == WAIT ) > + usleep(1000); > + > + thread_init(t); > + > + /* Ensure we all start at the same time. */ > + atomic_inc(&g.n_threads_running); > + while( g.n_threads_running != g.n_threads ) > + relax(); > + > + frc(&t->frc_start); > + doit(t); > + frc(&t->frc_stop); > + > + t->runtime = t->frc_stop - t->frc_start; > + > + /* Wait for everyone to finish so we don't disturb them by exiting and > + * waking the main thread. > + */ > + atomic_inc(&g.n_threads_finished); > + while( g.n_threads_finished != g.n_threads ) > + relax(); > + > + return NULL; > +} > + > +#define putfield(label, val, fmt, end) do { \ > + printf("%12s:\t", label); \ > + for (i = 0; i < g.n_threads; ++i) \ > + printf(" %"fmt, val); \ > + printf("%s\n", end); \ > + } while (0) > + > +void calculate(struct thread *t) > +{ > + int i, j; > + double sum; > + uint64_t count; > + > + for (i = 0; i < g.n_threads; ++i) { > + /* Calculate average */ > + sum = count = 0; > + for (j = 0; j < g.bucket_size; j++) { > + sum += 1.0 * t[i].buckets[j] * (g.bias+j+1); > + count += t[i].buckets[j]; > + } > + /* Add the extra amount of huge spikes in */ > + sum += t->overflow_sum; > + t[i].average = sum / count; > + } > +} > + > +static void write_summary(struct thread* t) > +{ > + int i, j, k, print_dotdotdot = 0; > + char bucket_name[64]; > + > + calculate(t); > + > + putfield("Core", t[i].core_i, "d", ""); > + putfield("CPU Freq", t[i].cpu_mhz, "u", " (Mhz)"); > + > + for (j = 0; j < g.bucket_size; j++) { > + if (j < g.bucket_size-1 && g.output_omit_zero_buckets) { > + for (k = 0; k < g.n_threads; k++) { > + if (t[k].buckets[j] != 0) > + break; > + } > + if (k == g.n_threads) { > + print_dotdotdot = 1; > + continue; > + } > + } > + > + if (print_dotdotdot) { > + printf(" ...\n"); > + print_dotdotdot = 0; > + } > + > + snprintf(bucket_name, sizeof(bucket_name), "%03"PRIu64 > + " (us)", g.bias+j+1); > + putfield(bucket_name, t[i].buckets[j], PRIu64, > + (j==g.bucket_size-1) ? " (including overflows)" : ""); > + } > + > + putfield("Minimum", t[i].minlat, PRIu64, " (us)"); > + putfield("Average", t[i].average, ".3lf", " (us)"); > + putfield("Maximum", t[i].maxlat, PRIu64, " (us)"); > + putfield("Max-Min", t[i].maxlat - t[i].minlat, PRIu64, " (us)"); > + putfield("Duration", cycles_to_sec(&(t[i]), t[i].runtime), > + ".3f", " (sec)"); > + printf("\n"); > +} > + > +static void run_expt(struct thread* threads, int runtime_secs) > +{ > + int i; > + > + g.runtime_secs = runtime_secs; > + g.n_threads_started = 0; > + g.n_threads_ready = 0; > + g.n_threads_running = 0; > + g.n_threads_finished = 0; > + g.cmd = WAIT; > + > + for( i = 0; i < g.n_threads; ++i ) { > + TEST0(pthread_create(&(threads[i].thread_id), NULL, > + thread_main, &(threads[i]))); > + } > + while( g.n_threads_started != g.n_threads ) { > + usleep(1000); > + } > + > + gettimeofday(&g.tv_start, NULL); > + g.cmd = GO; > + > + alarm(runtime_secs); > + > + /* Go to sleep until the threads have done their stuff. 
*/ > + for( i = 0; i < g.n_threads; ++i ) { > + pthread_join(threads[i].thread_id, NULL); > + } > +} > + > +static void handle_alarm(int code) > +{ > + g.cmd = STOP; > +} > + > +const char *helpmsg = > + "Usage: %s [options]\n" > + "\n" > + "This is an OS latency detector by running busy loops on specified cores.\n" > + "Please run this tool using root.\n" > + "\n" > + "Available options:\n" > + "\n" > + " -b, --bucket-size Specify the number of the buckets (4-1024)\n" > + " -B, --bias Add a bias to all the buckets using the estimated mininum\n" > + " -c, --cpu-list Specify CPUs to run on, e.g. '1,3,5,7-15'\n" > + " -C, --cpu-main-thread Specify which CPU the main thread runs on. Default is cpu0.\n" > + " -f, --rtprio Using SCHED_FIFO priority (1-99)\n" > + " -m, --workload-mem Size of the memory to use for the workload (e.g., 4K, 1M).\n" > + " Total memory usage will be this value multiplies 2*N,\n" > + " because there will be src/dst buffers for each thread, and\n" > + " N is the number of processors for testing.\n" > + " -s, --single-preheat Use a single thread when measuring latency at preheat stage\n" > + " NOTE: please make sure the CPU frequency on all testing cores\n" > + " are locked before using this parmater. If you don't know how\n" > + " to lock the freq then please don't use this parameter.\n" > + " -t, --runtime Specify test duration, e.g., 60, 20m, 2H\n" > + " (m/M: minutes, h/H: hours, d/D: days)\n" > + " -T, --trace-threshold Stop the test when threshold triggered (in us),\n" > + " print a marker in ftrace and stop ftrace too.\n" > + " -v, --version Display the version of the software.\n" > + " -w, --workload Specify a kind of workload, default is no workload\n" > + " (options: no, memmove)\n" > + " -z, --zero-omit Don't display buckets in the output histogram if all zeros.\n" > + "\n" > + ; > + > +static void usage(void) > +{ > + printf(helpmsg, g.app_name); > + exit(1); > +} > + > +/* TODO: use libnuma? 
*/ > +static int parse_cpu_list(char *cpu_list, cpu_set_t *cpu_set) > +{ > + struct bitmask *cpu_mask; > + int i, n_cores; > + > + n_cores = sysconf(_SC_NPROCESSORS_CONF); > + > + if (!cpu_list) { > + for (i = 0; i < n_cores; i++) > + CPU_SET(i, cpu_set); > + return n_cores; > + } > + > + cpu_mask = numa_parse_cpustring_all(cpu_list); > + if (cpu_mask) { > + for (i = 0; i < n_cores; i++) { > + if (numa_bitmask_isbitset(cpu_mask, i)) { > + CPU_SET(i, cpu_set); > + } > + } > + numa_bitmask_free(cpu_mask); > + } else { > + warn("Unknown cpu-list: %s, using all available cpus\n", cpu_list); > + for (i = 0; i < n_cores; i++) > + CPU_SET(i, cpu_set); > + } > + > + return n_cores; > +} > + > +static int parse_runtime(const char *str) > +{ > + char *endptr; > + int v = strtol(str, &endptr, 10); > + > + if (!*endptr) { > + return v; > + } > + > + switch (*endptr) { > + case 'd': > + case 'D': > + /* Days */ > + v *= 24; > + case 'h': > + case 'H': > + /* Hours */ > + v *= 60; > + case 'm': > + case 'M': > + /* Minutes */ > + v *= 60; > + case 's': > + case 'S': > + /* Seconds */ > + break; > + default: > + printf("Unknown runtime suffix: %s\n", endptr); > + v = 0; > + break; > + } > + > + return v; > +} > + > +static int parse_mem_size(char *str, uint64_t *val) > +{ > + char *endptr; > + int v = strtol(str, &endptr, 10); > + > + if (!*endptr) { > + return v; > + } > + > + switch (*endptr) { > + case 'g': > + case 'G': > + v *= 1024; > + case 'm': > + case 'M': > + v *= 1024; > + case 'k': > + case 'K': > + v *= 1024; > + case 'b': > + case 'B': > + break; > + default: > + return -1; > + } > + > + *val = v; > + > + return 0; > +} > + > +static int workload_select(char *name) > +{ > + int i = 0; > + > + for (i = 0; i < WORKLOAD_NUM; i++) { > + if (!strcmp(name, workload_list[i].w_name)) { > + g.workload = &workload_list[i]; > + return 0; > + } > + } > + > + return -1; > +} > + > +/* Process commandline options */ > +static void parse_options(int argc, char *argv[]) > +{ > + while (1) { > + static struct option options[] = { > + { "bucket-size", required_argument, NULL, 'b' }, > + { "cpu-list", required_argument, NULL, 'c' }, > + { "cpu-main-thread", required_argument, NULL, 'C'}, > + { "runtime", required_argument, NULL, 't' }, > + { "rtprio", required_argument, NULL, 'f' }, > + { "help", no_argument, NULL, 'h' }, > + { "trace-threshold", required_argument, NULL, 'T' }, > + { "workload", required_argument, NULL, 'w'}, > + { "workload-mem", required_argument, NULL, 'm'}, > + { "bias", no_argument, NULL, 'B'}, > + { "single-preheat", no_argument, NULL, 's'}, > + { "zero-omit", no_argument, NULL, 'u'}, > + { "version", no_argument, NULL, 'v'}, > + { NULL, 0, NULL, 0 }, > + }; > + int i, c = getopt_long(argc, argv, "b:Bc:C:f:hm:st:w:T:vz", > + options, NULL); > + long ncores; > + > + if (c == -1) > + break; > + > + switch (c) { > + case 'b': > + g.bucket_size = strtol(optarg, NULL, 10); > + if (g.bucket_size > 1024 || g.bucket_size <= 4) { > + printf("Illegal bucket size: %s (should be: 4-1024)\n", > + optarg); > + exit(1); > + } > + break; > + case 'B': > + g.enable_bias = 1; > + break; > + case 'c': > + g.cpu_list = strdup(optarg); > + break; > + case 'C': > + ncores = sysconf(_SC_NPROCESSORS_CONF); > + g.cpu_main_thread = strtol(optarg, NULL, 10); > + if (g.cpu_main_thread < 0 || g.cpu_main_thread > ncores) { > + printf("Illegal core for main thread: %s (should be: 0-%ld)\n", > + optarg, ncores); > + exit(1); > + } > + break; > + case 't': > + g.runtime = parse_runtime(optarg); > + if (!g.runtime) { > 
+ printf("Illegal runtime: %s\n", optarg); > + exit(1); > + } > + break; > + case 'f': > + g.rtprio = strtol(optarg, NULL, 10); > + if (g.rtprio < 1 || g.rtprio > 99) { > + printf("Illegal RT priority: %s (should be: 1-99)\n", optarg); > + exit(1); > + } > + break; > + case 'T': > + g.trace_threshold = strtol(optarg, NULL, 10); > + if (g.trace_threshold <= 0) { > + printf("Parameter --trace-threshold needs to be positive\n"); > + exit(1); > + } > + enable_trace_mark(); > + break; > + case 'w': > + if (workload_select(optarg)) { > + printf("Unknown workload '%s'. Please choose from: ", optarg); > + for (i = 0; i < WORKLOAD_NUM; i++) { > + printf("'%s'", workload_list[i].w_name); > + if (i != WORKLOAD_NUM - 1) { > + printf(", "); > + } > + } > + printf("\n\n"); > + exit(1); > + } > + break; > + case 'm': > + if (parse_mem_size(optarg, &g.workload_mem_size)) { > + printf("Unknown workload memory size '%s'.\n\n", optarg); > + exit(1); > + } > + break; > + case 's': > + /* > + * Only use one core for pre-heat. Then if --bias is used, the > + * bias will be exactly the min value of the pre-heat core. > + */ > + g.single_preheat_thread = true; > + break; > + case 'v': > + /* > + * Because we always dump the version even before parsing options, > + * what we need to do is to quit.. > + */ > + exit(0); > + break; > + case 'z': > + g.output_omit_zero_buckets = 1; > + break; > + default: > + usage(); > + break; > + } > + } > +} > + > +void dump_globals(void) > +{ > + printf("Total runtime: \t\t%d seconds\n", g.runtime); > + printf("Thread priority: \t"); > + if (g.rtprio) { > + printf("SCHED_FIFO:%d\n", g.rtprio); > + } else { > + printf("default\n"); > + } > + printf("CPU list: \t\t%s\n", g.cpu_list ?: "(all cores)"); > + printf("CPU for main thread: \t%d\n", g.cpu_main_thread); > + printf("Workload: \t\t%s\n", g.workload->w_name); > + printf("Workload mem: \t\t%"PRIu64" (KiB)\n", > + (g.workload->w_flags & WORK_NEED_MEM) ? > + (g.workload_mem_size / 1024) : 0); > + printf("Preheat cores: \t\t%d\n", g.single_preheat_thread ? 
> + 1 : g.n_threads_total); > + printf("\n"); > +} > + > +static void record_bias(struct thread *t) > +{ > + int i; > + uint64_t bias = (uint64_t)-1; > + > + if (!g.enable_bias) { > + return; > + } > + > + /* Record the min value of minlat on all the threads */ > + for( i = 0; i < g.n_threads; ++i ) { > + if (t[i].minlat < bias) { > + bias = t[i].minlat; > + } > + } > + g.bias = bias; > + printf("Global bias set to %" PRId64 " (us)\n", bias); > +} > + > +int main(int argc, char* argv[]) > +{ > + struct thread* threads; > + int i, n_cores; > + cpu_set_t cpu_set; > + > + CPU_ZERO(&cpu_set); > + > + g.app_name = argv[0]; > + g.rtprio = 0; > + g.bucket_size = BUCKET_SIZE; > + g.runtime = 1; > + g.workload = &workload_list[WORKLOAD_DEFUALT]; > + g.workload_mem_size = WORKLOAD_MEM_SIZE; > + /* Run the main thread on cpu0 by default */ > + g.cpu_main_thread = 0; > + > + printf("\nVersion: %1.2f\n\n", VERSION); > + > + parse_options(argc, argv); > + > + TEST(mlockall(MCL_CURRENT | MCL_FUTURE) == 0); > + > + n_cores = parse_cpu_list(g.cpu_list, &cpu_set); > + > + TEST( threads = calloc(1, CPU_COUNT(&cpu_set) * sizeof(threads[0])) ); > + for( i = 0; i < n_cores; ++i ) > + if (CPU_ISSET(i, &cpu_set) && move_to_core(i) == 0) > + threads[g.n_threads_total++].core_i = i; > + > + if (CPU_ISSET(0, &cpu_set) && g.rtprio) { > + printf("WARNING: Running SCHED_FIFO workload on CPU 0 " > + "may hang the main thread\n"); > + } > + > + TEST(move_to_core(g.cpu_main_thread) == 0); > + > + signal(SIGALRM, handle_alarm); > + signal(SIGINT, handle_alarm); > + signal(SIGTERM, handle_alarm); > + > + dump_globals(); > + > + printf("Pre-heat for 1 seconds...\n"); > + if (g.single_preheat_thread) { > + g.n_threads = 1; > + } else { > + g.n_threads = g.n_threads_total; > + } > + run_expt(threads, 1); > + record_bias(threads); > + > + printf("Test starts...\n"); > + /* Reset n_threads to always run on all the cores */ > + g.n_threads = g.n_threads_total; > + run_expt(threads, g.runtime); > + > + printf("Test completed.\n\n"); > + > + write_summary(threads); > + > + if (g.cpu_list) { > + free(g.cpu_list); > + g.cpu_list = NULL; > + } > + > + return 0; > +} > -- > 2.26.2 > > - I made a few quick edits to the grammar in the man page, but other than that Signed-off-by: John Kacur <jkacur@xxxxxxxxxx>
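
For anyone who wants to try this once it lands in rt-tests, a typical
invocation would look something like the following (the CPU list, priority
and runtime are only an example - pick isolated cores that match your own
setup, and run it as root):

  # oslat -c 1-4 -f 1 -t 10m -w memmove -m 4K -T 30 -z

That runs the busy loop plus the memmove workload (with 4K src/dst buffers
per thread) on CPUs 1-4 for ten minutes at SCHED_FIFO priority 1, leaves the
main thread on the default CPU 0, writes an ftrace marker and stops the test
as soon as a single loop iteration hits 30 us, and omits all-zero buckets
from the histogram.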
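
One more note on -B/--bias for anyone reading the histogram output: the bias
is simply the smallest latency observed during the pre-heat pass, it is
subtracted from every sample afterwards, and the printed bucket labels are
shifted up by the same amount. So with the default 32 buckets and a pre-heat
minimum of, say, 5 us, the histogram covers roughly 6-37 us instead of
1-32 us, i.e. the fixed number of buckets is spent around the range that was
actually observed.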