On Fri, Apr 12, 2013 at 09:55:56AM +0200, Peter Zijlstra wrote: > > The above is totally untested, but each step is pretty damn simple and > > fairly cheap. Sure, it's a loop, but it's bounded to 32 (cheap) > > iterations, and the normal case is that it's not done at all, or done > > only a few times. > > Right it gets gradually heavier the bigger the numbers get; which is > more and more unlikely. > > > And the advantage is that the end result is always that simple > > 32x32/32 case that we started out with as the common case. > > > > I dunno. Maybe I'm overlooking something, and the above is horrible, > > but the above seems reasonably efficient if not optimal, and > > *understandable*. > > I suppose that entirely matters on what one is used to ;-) I had to > stare rather hard at it for a little while. > > But yes, you take it one step further and are willing to ditch rtime > bits too and I suppose that's fine. > > Should work,.. Stanislaw could you stick this into your userspace > thingy and verify the numbers are sane enough? It works fine - gives relative error less than 0.1% for very big numbers. For the record I'm attaching test program and script. Thanks Stanislaw
#include <stdio.h> #include <stdlib.h> #include <assert.h> #include <strings.h> #include <stdint.h> typedef uint64_t u64; typedef uint32_t u32; static u64 div_u64_u32(u64 a, u32 b) { return a / b; } static u64 scale_stime(u64 stime, u64 rtime, u64 total) { /* We know one of the values has a bit set in the high 32 bits */ for (;;) { /* Make sure "rtime" is the bigger of stime/rtime */ if (stime > rtime) { u64 tmp = rtime; rtime = stime; stime = tmp; } /* Do we need to balance stime/rtime bits? */ if (rtime >> 32) { if (stime >> 31) goto drop_precision; /* We can grow rtime and shrink stime and try to make them both fit */ stime <<= 1; rtime >>= 1; continue; } /* stime/rtime fits in 32 bits, how about total? */ if (!(total >> 32)) break; drop_precision: /* We drop from rtime, it has more bits than stime */ rtime >>= 1; total >>= 1; } if (!total) return stime; /* Make sure gcc understands that this is a 32x32->64 multiply, * followed by a 64/32->64 divide */ return div_u64_u32((u64) (u32) stime * (u64) (u32) rtime, (u32)total); } int main(int argc, char *argv[]) { u64 rtime, total, stime, scaled; if (argc != 4) return; rtime = strtoll(argv[1], NULL, 10); total = strtoll(argv[2], NULL, 10); stime = strtoll(argv[3], NULL, 10); assert (total >= stime); scaled = scale_stime(stime, rtime, total); printf("%llu\n", scaled); return 0; }
#!/usr/bin/python import subprocess import random import math def kernel_scale (rtime, total, stime): p = subprocess.Popen("./scale_stime5 " + str(rtime) + " " + str(total) + " " + str(stime) , shell=True, stdout=subprocess.PIPE) return int(p.stdout.read()) def python_scale (rtime, total, stime): return (stime * rtime) / total max_rtime = 10*4096*364*24*60*60*1000; # 10 years for 4096 threads fail=False K=10000 for i in range(0, K): rtime = random.randrange(max_rtime) total = int(random.uniform(0.1, 1.9) * rtime) for n in range(1, 100): stime = (n * total / 100) r1 = kernel_scale(rtime, total, stime) r2 = python_scale(rtime, total, stime) if (float(abs(r1 - r2)) / float(r2)) > 0.001: print "FAIL!" print "rtime: " + str(rtime) print "total: " + str(total) print "stime: " + str(stime) print "kernel: " + str(r1) print "python: " + str(r2) fail=True break if fail: break; if (i % 100) == 99: print str(i/100) + "/" + str(K/100) + " OK"