On Thu, Feb 02, 2006 at 02:26:38AM +0100, Gabriel Paubert wrote: > > The first step can be implemented slightly better: > > unsigned int res = w-((w>>1)&0x55555555); > Yes. I've got many advices about hweight speedup. static unsigned int hweight32(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); res = (res + (res >> 4)) & 0x0F0F0F0F; res = res + (res >> 8); return (res + (res >> 16)) & 0x000000FF; } static unsigned int hweight16(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res + (res >> 4)) & 0x0F0F; return (res + (res >> 8)) & 0x00FF; } static unsigned int hweight8(unsigned int w) { unsigned int res = w - ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res + (res >> 4)) & 0x0F; } static unsigned long hweight64(__u64 w) { #if BITS_PER_LONG < 64 return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); #else __u64 res = w - ((w >> 1) & 0x5555555555555555ul); res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful; res = res + (res >> 8); res = res + (res >> 16); return (res + (res >> 32)) & 0x00000000000000FFul; #endif }