RE: fiologparser_hist.py script patch and enhancements?

Sorry I've been unavailable for a couple of weeks. 

I chose option #2:
2. Keep one script but somehow let it contain non-weighted vs weighted paths.

I merged the changes from my "no weighted" version with the original script and added a "--noweight" option. I didn't try to combine the file I/O methods; instead I split the latter part of the "main" function into "output_interval_data" and "output_weighted_interval_data", for lack of more creative names :).
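
For anyone who wants to try it, a quick usage sketch (the log file names here are hypothetical; the options are the ones added by the patch below):

    # weighted interpolation, as before (the default)
    fiologparser_hist.py --interval 1000 job_clat_hist.*.log

    # non-weighted merging with custom percentiles, ns-based bins (fio >= 2.99)
    fiologparser_hist.py --noweight --percentiles 80,99.9 job_clat_hist.*.log

With "--percentiles 80,99.9" the header becomes: end-time, samples, min, avg, median, 80%, 99.9%, max.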

Below is the new patch.

Thanks,
Kris Davis
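
For readers comparing the two paths: the weighted path apportions each sample's histogram across output intervals by the fraction of the sample's time span that overlaps each interval, while the --noweight path simply sums the bins of every sample whose timestamp falls inside the interval. A scalar sketch of the weighting idea (the patch's weights() does the same thing with vectorized numpy):

    def overlap_weight(s, e, i_start, i_end):
        """ Fraction of a sample's span [s, e] overlapping [i_start, i_end].
            Zero-length samples get weight 0, i.e. they are ignored. """
        span = e - s
        if span <= 0:
            return 0.0
        return max(0.0, (min(e, i_end) - max(s, i_start)) / span)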


diff --git a/tools/hist/fiologparser_hist.py b/tools/hist/fiologparser_hist.py
index 62a4eb4..7acd2d3
--- a/tools/hist/fiologparser_hist.py
+++ b/tools/hist/fiologparser_hist.py
@@ -16,9 +16,49 @@
 import os
 import sys
 import pandas
+import re
 import numpy as np
 
+runascmd = False
+
 err = sys.stderr.write
+
+class HistFileRdr():
+    """ Class to read a hist file line by line, buffering 
+        a value array for the latest line, and allowing a preview
+        of the next timestamp in next line
+        Note: this does not follow a generator pattern, but must explicitly
+        get next bin array.
+    """
+    def __init__(self, file):
+        self.fp = open(file, 'r')
+        self.data = self.nextData()
+        
+    def close(self):
+        self.fp.close()
+        self.fp = None
+        
+    def nextData(self):
+        self.data = None
+        if self.fp: 
+            line = self.fp.readline()
+            if line == "":
+                self.close()
+            else:
+                self.data = [int(x) for x in line.replace(' ', '').rstrip().split(',')]
+                
+        return self.data
+ 
+    @property
+    def curTS(self):
+        ts = None
+        if self.data:
+            ts = self.data[0]
+        return ts
+             
+    @property
+    def curBins(self):
+        return self.data[3:]
 
 def weighted_percentile(percs, vs, ws):
     """ Use linear interpolation to calculate the weighted percentile.
@@ -42,7 +82,7 @@
     """ Calculate weights based on fraction of sample falling in the
         given interval [start,end]. Weights computed using vector / array
         computation instead of for-loops.
-    
+
         Note that samples with zero time length are effectively ignored
         (we set their weight to zero).
 
@@ -64,8 +104,18 @@
 def weighted_average(vs, ws):
     return np.sum(vs * ws) / np.sum(ws)
 
-columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
-percs   = [50, 90, 95, 99]
+
+percs = None
+columns = None
+
+def gen_output_columns(percentiles):
+    global percs,columns
+    strpercs = re.split('[,:]', percentiles)
+    percs = [50.0]  # always print 50% in 'median' column
+    percs.extend(list(map(float,strpercs)))
+    columns = ["end-time", "samples", "min", "avg", "median"]
+    columns.extend(list(map(lambda x: x+'%', strpercs)))
+    columns.append("max")
 
 def fmt_float_list(ctx, num=1):
   """ Return a comma separated list of float formatters to the required number
@@ -80,7 +130,7 @@
 __HIST_COLUMNS = 1216
 __NON_HIST_COLUMNS = 3
 __TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS
-    
+
 def read_chunk(rdr, sz):
     """ Read the next chunk of size sz from the given reader. """
     try:
@@ -88,7 +138,7 @@
             occurs if rdr is None due to the file being empty. """
         new_arr = rdr.read().values
     except (StopIteration, AttributeError):
-        return None    
+        return None
 
     """ Extract array of just the times, and histograms matrix without times column. """
     times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
@@ -178,7 +228,11 @@
     avg = weighted_average(vs, ws)
     values = [mn, avg] + list(ps) + [mx]
     row = [end, ss_cnt] + [float(x) / ctx.divisor for x in values]
-    fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
+    if ctx.divisor > 1:
+        fmt = "%d, %d, " + fmt_float_list(ctx, len(percs)+3)
+    else:
+        # min and max remain integer values when there is no divisor
+        fmt = "%d, %d, %d, " + fmt_float_list(ctx, len(percs)+1) + ", %d"
     print (fmt % tuple(row))
 
 def update_extreme(val, fncn, new_val):
@@ -191,33 +245,51 @@
 lower_bin_vals = [] # lower edge of each bin
 upper_bin_vals = [] # upper edge of each bin 
 
-def process_interval(ctx, samples, iStart, iEnd):
+def process_interval(ctx, iHist, iEnd):
+    """ print estimated percentages for the given merged sample
+    """
+    ss_cnt = 0 # number of samples affecting this interval
+    mn_bin_val, mx_bin_val = None, None
+   
+    # Update total number of samples affecting current interval histogram:
+    ss_cnt += np.sum(iHist)
+        
+    # Update min and max bin values
+    idxs = np.nonzero(iHist != 0)[0]
+    if idxs.size > 0:
+        mn_bin_val = bin_vals[idxs[0]]
+        mx_bin_val = bin_vals[idxs[-1]]
+
+    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)
+
+
+def process_weighted_interval(ctx, samples, iStart, iEnd):
     """ Construct the weighted histogram for the given interval by scanning
         through all the histograms and figuring out which of their bins have
         samples with latencies which overlap with the given interval
         [iStart,iEnd].
     """
-    
+
     times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
     iHist = np.zeros(__HIST_COLUMNS)
     ss_cnt = 0 # number of samples affecting this interval
     mn_bin_val, mx_bin_val = None, None
 
     for end_time,file,hist in zip(times,files,hists):
-            
+
         # Only look at bins of the current histogram sample which
         # started before the end of the current time interval [start,end]
-        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
+        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / ctx.time_divisor
         idx = np.where(start_times < iEnd)
         s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]
 
         # Increment current interval histogram by weighted values of future histogram:
         ws = hs * weights(s_ts, end_time, iStart, iEnd)
         iHist[idx] += ws
-    
+
         # Update total number of samples affecting current interval histogram:
         ss_cnt += np.sum(hs)
-        
+
         # Update min and max bin values seen if necessary:
         idx = np.where(hs != 0)[0]
         if idx.size > 0:
@@ -241,7 +313,7 @@
     idx = np.where(arr == hist_cols)
     if len(idx[1]) == 0:
         table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array','     ')
-        err("Unable to determine bin values from input clat_hist files. Namely \n"
+        errmsg = ("Unable to determine bin values from input clat_hist files. Namely \n"
             "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
             "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
             "This number needs to be equal to one of the following numbers:\n\n"
@@ -250,47 +322,15 @@
             "  - Input file(s) does not contain histograms.\n"
             "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
             "    new GROUP_NR on the command line with --group_nr\n")
-        exit(1)
+        if runascmd:
+            err(errmsg)
+            exit(1)
+        else:
+            raise RuntimeError(errmsg) 
+        
     return bins[idx[1][0]]
 
-def main(ctx):
-
-    if ctx.job_file:
-        try:
-            from configparser import SafeConfigParser, NoOptionError
-        except ImportError:
-            from ConfigParser import SafeConfigParser, NoOptionError
-
-        cp = SafeConfigParser(allow_no_value=True)
-        with open(ctx.job_file, 'r') as fp:
-            cp.readfp(fp)
-
-        if ctx.interval is None:
-            # Auto detect --interval value
-            for s in cp.sections():
-                try:
-                    hist_msec = cp.get(s, 'log_hist_msec')
-                    if hist_msec is not None:
-                        ctx.interval = int(hist_msec)
-                except NoOptionError:
-                    pass
-
-    if ctx.interval is None:
-        ctx.interval = 1000
-
-    # Automatically detect how many columns are in the input files,
-    # calculate the corresponding 'coarseness' parameter used to generate
-    # those files, and calculate the appropriate bin latency values:
-    with open(ctx.FILE[0], 'r') as fp:
-        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
-        __TOTAL_COLUMNS = len(fp.readline().split(','))
-        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS
-
-        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
-        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
-        bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness) for x in np.arange(__HIST_COLUMNS)], dtype=float)
-        lower_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 0.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)
-        upper_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 1.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)
+def output_weighted_interval_data(ctx):
 
     fps = [open(f, 'r') for f in ctx.FILE]
     gen = histogram_generator(ctx, fps, ctx.buff_size)
@@ -322,7 +362,7 @@
                     start = start - (start % ctx.interval)
                     end = start + ctx.interval
 
-                process_interval(ctx, arr, start, end)
+                process_weighted_interval(ctx, arr, start, end)
                 
                 # Update arr to throw away samples we no longer need - samples which
                 # end before the start of the next interval, i.e. the end of the
@@ -335,10 +375,112 @@
     finally:
         for fp in fps:
             fp.close()
+ 
+def output_interval_data(ctx):
+    fps = [HistFileRdr(f) for f in ctx.FILE]
+
+    print(', '.join(columns))
+
+    start = 0
+    end = ctx.interval
+    while True:
+        
+        more_data = False
+        
+        # add bins from all files in target intervals
+        arr = None
+        numSamples = 0
+        while True:
+            foundSamples = False
+            for fp in fps:
+                ts = fp.curTS
+                if ts and ts+10 < end:  # defer samples within 10ms of the interval end to the next interval
+                    numSamples += 1
+                    foundSamples = True
+                    if arr is None: 
+                        arr = np.zeros(shape=(__HIST_COLUMNS), dtype=int)
+                    arr = np.add(arr, fp.curBins)
+                    more_data = True
+                    fp.nextData()
+                elif ts:
+                    more_data = True
+            
+            # reached the end of all files,
+            # or went through all files without finding a sample in the interval
+            if not more_data or not foundSamples:
+                break
+        
+        if arr is not None:
+            #print("{} size({}) samples({}) nonzero({}):".format(end, arr.size, numSamples, np.count_nonzero(arr)), str(arr), )
+            process_interval(ctx, arr, end)         
+        
+        # reached the end of all files
+        if not more_data:
+            break
+            
+        start += ctx.interval
+        end = start + ctx.interval
+
+ 
+def main(ctx):
+
+    if ctx.job_file:
+        try:
+            from configparser import SafeConfigParser, NoOptionError
+        except ImportError:
+            from ConfigParser import SafeConfigParser, NoOptionError
+
+        cp = SafeConfigParser(allow_no_value=True)
+        with open(ctx.job_file, 'r') as fp:
+            cp.readfp(fp)
+
+        if ctx.interval is None:
+            # Auto detect --interval value
+            for s in cp.sections():
+                try:
+                    hist_msec = cp.get(s, 'log_hist_msec')
+                    if hist_msec is not None:
+                        ctx.interval = int(hist_msec)
+                except NoOptionError:
+                    pass
+    
+    if not hasattr(ctx, 'percentiles'):
+        ctx.percentiles = "90,95,99"
+    gen_output_columns(ctx.percentiles)
+
+    if ctx.interval is None:
+        ctx.interval = 1000
+
+    if ctx.usbin:
+        ctx.time_divisor = 1000.0        # bins are in us
+    else:
+        ctx.time_divisor = 1000000.0     # bins are in ns
+
+
+    # Automatically detect how many columns are in the input files,
+    # calculate the corresponding 'coarseness' parameter used to generate
+    # those files, and calculate the appropriate bin latency values:
+    with open(ctx.FILE[0], 'r') as fp:
+        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
+        __TOTAL_COLUMNS = len(fp.readline().split(','))
+        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS
+
+        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
+        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
+        bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness) for x in np.arange(__HIST_COLUMNS)], dtype=float)
+        lower_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 0.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)
+        upper_bin_vals = np.array([plat_idx_to_val_coarse(x, coarseness, 1.0) for x in np.arange(__HIST_COLUMNS)], dtype=float)
+
+    
+    if ctx.noweight:
+        output_interval_data(ctx)
+    else:
+        output_weighted_interval_data(ctx)
 
 
 if __name__ == '__main__':
     import argparse
+    runascmd = True
     p = argparse.ArgumentParser()
     arg = p.add_argument
     arg("FILE", help='space separated list of latency log filenames', nargs='+')
@@ -355,6 +497,11 @@
     arg('-i', '--interval',
         type=int,
         help='interval width (ms), default 1000 ms')
+
+    arg('--noweight',
+        action='store_true',
+        default=False,
+        help='do not perform weighting of samples between output intervals')
 
     arg('-d', '--divisor',
         required=False,
@@ -385,5 +532,16 @@
              'given histogram files. Useful for auto-detecting --log_hist_msec and '
              '--log_unix_epoch (in fio) values.')
 
+    arg('--percentiles',
+        default="90:95:99",
+        type=str,
+        help='Optional argument of comma or colon separated percentiles to print. '
+             'The default is "90:95:99". min, median (50%%), and max are always printed.')
+    
+    arg('--usbin',
+        default=False,
+        action='store_true',
+        help='histogram bin latencies are in us (fio versions < 2.99; fio uses ns for versions >= 2.99)')
+
     main(p.parse_args())
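
A side note on the runascmd/RuntimeError change above: since guess_max_from_bins now raises instead of calling exit(1) when the script is not run as a command, the module can also be driven from other Python code. A minimal sketch of that, assuming the module is importable as fiologparser_hist and using a hypothetical log file name (the Namespace attributes mirror the argparse options; this exercises the new --noweight path):

    import argparse
    import fiologparser_hist as flh

    ctx = argparse.Namespace(
        FILE=['job_clat_hist.1.log'],  # hypothetical clat_hist log
        job_file=None,
        interval=1000,            # ms
        noweight=True,            # the new non-weighted path
        percentiles='90:95:99',
        usbin=False,              # bins in ns (fio >= 2.99)
        divisor=1,
        decimals=3,               # output decimal places
        group_nr=19,              # 19 groups x 64 bins = 1216 columns; adjust if fio was rebuilt with another GROUP_NR
    )
    flh.main(ctx)  # raises RuntimeError on a bin-count mismatch instead of exiting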
