Am 06 Sep 2001 10:19:28 +0200 schrieb Mattias Engdegård:

> This is completely portable, and should be a good deal faster than
> conditionally adding each component separately, at least on modern
> superscalar machines with expensive unpredicted branches. And benchmarks
> confirm this

I like it. I did some benchmarking of a few routines with different compilers on PPC and i686, and here are the results:

egger@sonja:~/test > time ./testmat
Time needed for padd_sat_4x8 in clocks: 5550000
Time needed for padd_sat_4x8_and in clocks: 6640000
Time needed for padd_sat_4x8_norm in clocks: 7090000

real 0m21.046s
user 0m18.680s
sys 0m0.650s

Options to compile: /opt/experimental/bin/gcc -O3 -fssa -save-temps test.c -o testmat

egger@sonja:~/test > time ./testmat
Time needed for padd_sat_4x8 in clocks: 5550000
Time needed for padd_sat_4x8_and in clocks: 5840000
Time needed for padd_sat_4x8_norm in clocks: 7840000
Time needed for padd_sat_4x8_vec in clocks: 1780000

real 0m21.477s
user 0m20.520s
sys 0m0.530s

Same machine, but gcc-2.95.3 with AltiVec support. Options to compile: /opt/gcc-altivec/bin/gcc -O3 -mcpu=7400 -fvec -save-temps test.c -o testmat

egger@alex:~ > time ./testmat
Time needed for padd_sat_4x8 in clocks: 8830000
Time needed for padd_sat_4x8_and in clocks: 10730000
Time needed for padd_sat_4x8_norm in clocks: 11010000

real 0m30.614s
user 0m30.370s
sys 0m0.210s

This machine is a Duron-800 with 1GB RAM. I've no idea why it performs so poorly compared to the G4. The compiler was gcc 2.95.3 with -march=i686 and -mcpu=i686; however, the compiler didn't use the conditional move instructions from the higher Pentium CPUs, which should have sped up the _norm case considerably, as it is possible to do the same without branches.

The source is attached; feel free to study it and provide faster code. At the moment it is pretty clear that Mattias' code is pretty efficient and compiles equally well with several compilers on several architectures.

Servus,
Daniel
/*
 * Micro-benchmark for several implementations of a packed saturating add:
 * four unsigned 8-bit lanes stored in one 32-bit word are added lane by
 * lane, and every lane is clamped to 255 instead of wrapping around.
 */
#include <glib-1.2/glib.h>
#include <stdio.h>
#include <time.h>

enum
{
  N_WORDS = 20000000,   /* number of 32-bit words in every buffer */
  N_PASSES = 10         /* how often each routine sweeps the buffers */
};

/* Buffers are 16-byte aligned; the AltiVec variant below is handed
 * addresses that advance by four words (16 bytes) per call. */
static guint32 dest[N_WORDS] __attribute__ ((aligned (16)));
static guint32 source1[N_WORDS] __attribute__ ((aligned (16)));
static guint32 source2[N_WORDS] __attribute__ ((aligned (16)));

/*
 * Branch-free saturating add that works on the whole word at once.
 *
 * dest: receives the four saturated lane sums.
 * pa, pb: the two packed operands (only read).
 *
 * Purely arithmetic on the 32-bit value, so the result does not depend
 * on the byte order of the host.
 */
static inline void
padd_sat_4x8 (guint32 *dest, const guint32 *pa, const guint32 *pb)
{
  guint32 a = *pa, b = *pb;
  guint32 ta, tb, tm, q, u, m;

  /* save the top bit of every lane in ta, tb */
  ta = a & 0x80808080;
  tb = b & 0x80808080;

  /* add the low 7 bits of every lane; cannot carry into the next lane */
  q = a + b - (ta + tb);

  /* a lane overflows if both top bits were set, or if one top bit was
   * set and the 7-bit sum carried into bit 7 */
  tm = ta | tb;
  u = (ta & tb) | (q & tm);

  /* u holds 0x80 in every overflowing lane; widen that to 0xff.
   * For the top lane the shifted-out bit is replaced by unsigned
   * wrap-around of the subtraction. */
  m = (u << 1) - (u >> 7);

  /* put the top bits back into the lanes that did not overflow and
   * force the overflowing lanes to 0xff */
  *dest = ((q + tm - u) | m);
}

/*
 * Straightforward reference version: add the four bytes one at a time in
 * memory order and clamp each sum with a conditional.
 *
 * dest: receives the four saturated sums.
 * pa, pb: the two packed operands (only read).
 */
static inline void
padd_sat_4x8_norm (guint32 *dest, const guint32 *pa, const guint32 *pb)
{
  const guint8 *a = (const guint8 *) pa;
  const guint8 *b = (const guint8 *) pb;
  guint8 *out = (guint8 *) dest;
  guint16 dr, dg, db, da;

  /* all inputs are read before anything is stored, as in the original,
   * so dest may alias one of the operands */
  dr = a[0] + b[0];
  dg = a[1] + b[1];
  db = a[2] + b[2];
  da = a[3] + b[3];

  out[0] = dr > 255 ? 255 : dr;
  out[1] = dg > 255 ? 255 : dg;
  out[2] = db > 255 ? 255 : db;
  out[3] = da > 255 ? 255 : da;
}

/*
 * Saturating sum of the low 8 bits of x and y, without a branch.
 * Helper for padd_sat_4x8_and.
 */
static inline guint32
padd_sat_lane (guint32 x, guint32 y)
{
  guint32 sum = (x & 0xff) + (y & 0xff);        /* 0 .. 510 */

  /* (sum >> 8) is 1 exactly when the lane overflowed; subtracting 1 and
   * complementing turns that into an all-ones mask (or zero otherwise) */
  return (sum | ~((sum >> 8) - 1)) & 0xff;
}

/*
 * Shift-and-mask version: extract each lane, add, saturate with a mask.
 *
 * dest: receives the four saturated lane sums.
 * pa, pb: the two packed operands (only read).
 *
 * Fixes relative to the first posting of this routine:
 *  - `x & 0xff + y & 0xff` parses as `x & (0xff + y) & 0xff` because `+`
 *    binds tighter than `&`; every operand is now masked explicitly
 *    before the addition.
 *  - the result is assembled as a 32-bit value instead of being stored
 *    byte by byte, so each sum lands in the lane it came from on both
 *    big- and little-endian hosts.
 */
static inline void
padd_sat_4x8_and (guint32 *dest, const guint32 *pa, const guint32 *pb)
{
  guint32 s1 = *pa, s2 = *pb;

  *dest = (padd_sat_lane (s1 >> 24, s2 >> 24) << 24)
        | (padd_sat_lane (s1 >> 16, s2 >> 16) << 16)
        | (padd_sat_lane (s1 >> 8, s2 >> 8) << 8)
        | padd_sat_lane (s1, s2);
}

#ifdef __VEC__
/*
 * AltiVec version: one saturating vector add per call.  main() advances
 * the pointers by four words between calls.
 *
 * NOTE(review): newer compilers need <altivec.h> for vec_ld/vec_adds/
 * vec_st; the compiler this was written for apparently provided them
 * without it -- confirm before adding the include.
 */
static inline void
padd_sat_4x8_vec (guint32 *dest, guint32 *pa, guint32 *pb)
{
  /* named va/vb so they no longer shadow the global source buffers */
  vector unsigned char vdest, va, vb;

  va = vec_ld (0, (unsigned char *) pa);
  vb = vec_ld (0, (unsigned char *) pb);
  vdest = vec_adds (va, vb);
  vec_st (vdest, 0, (unsigned char *) dest);
}
#endif

/*
 * Time N_PASSES sweeps of `fn` over the global buffers, advancing `step`
 * words per call, and print the elapsed processor time as reported by
 * clock().  A macro rather than a function taking a function pointer so
 * that the routine under test can still be inlined into the loop.
 */
#define BENCH(fn, step)                                                 \
  do {                                                                  \
    clock_t bench_start;                                                \
    clock_t bench_elapsed;                                              \
    int bench_pass, bench_i;                                            \
                                                                        \
    bench_start = clock ();                                             \
    for (bench_pass = 0; bench_pass < N_PASSES; bench_pass++)           \
      {                                                                 \
        for (bench_i = 0; bench_i < N_WORDS; bench_i += (step))         \
          {                                                             \
            fn (dest + bench_i, source1 + bench_i, source2 + bench_i);  \
          }                                                             \
      }                                                                 \
    bench_elapsed = clock () - bench_start;                             \
    printf ("Time needed for " #fn " in clocks: %ld\n",                 \
            (long) bench_elapsed);                                      \
  } while (0)

/* Run every available implementation once and report its timing. */
int
main (void)
{
  BENCH (padd_sat_4x8, 1);
  BENCH (padd_sat_4x8_and, 1);
  BENCH (padd_sat_4x8_norm, 1);
#ifdef __VEC__
  BENCH (padd_sat_4x8_vec, 4);
#endif

  return 0;
}