On Wed, Dec 07, 2022 at 02:22:54PM +0200, Vladimir Oltean wrote: > Wait a second, I deliberately wrote the code without conditionals. > Let me look at the code disassembly before and after the patch and see > what they look like. Before (make lib/packing.lst on arm64): for (i = 0; i < width; i++) { 1c: 310004ec adds w12, w7, #0x1 20: 540001c0 b.eq 58 <adjust_for_msb_right_quirk+0x58> // b.none 24: 52800004 mov w4, #0x0 // #0 bit = (val & (1 << i)) != 0; 28: 5280002b mov w11, #0x1 // #1 2c: d503201f nop 30: 1ac42166 lsl w6, w11, w4 new_val |= (bit << (width - i - 1)); 34: 4b0400e9 sub w9, w7, w4 bit = (val & (1 << i)) != 0; 38: 93407cc6 sxtw x6, w6 /* We don't want sign extension */ 3c: ea0a00df tst x6, x10 40: 1a9f07e6 cset w6, ne // ne = any for (i = 0; i < width; i++) { 44: 6b07009f cmp w4, w7 48: 11000484 add w4, w4, #0x1 new_val |= (bit << (width - i - 1)); 4c: 1ac920c6 lsl w6, w6, w9 50: aa060108 orr x8, x8, x6 for (i = 0; i < width; i++) { 54: 54fffee1 b.ne 30 <adjust_for_msb_right_quirk+0x30> // b.any Then this modified code: static u64 bit_reverse(u64 val, unsigned int width) { u64 new_val = 0; unsigned int i; for (i = 0; i < width; i++) if (val & BIT_ULL(i)) new_val |= BIT_ULL(width - i - 1); return new_val; } compiles to this: for (i = 0; i < width; i++) { 1c: 310004eb adds w11, w7, #0x1 20: 540001c0 b.eq 58 <adjust_for_msb_right_quirk+0x58> // b.none 24: 52800005 mov w5, #0x0 // #0 new_val |= BIT_ULL(width - i - 1); 28: d280002a mov x10, #0x1 // #1 2c: 14000002 b 34 <adjust_for_msb_right_quirk+0x34> /* this is an unconditional jump to "sub w4, w7, w5", skipping the assignment to w5 */ 30: 2a0803e5 mov w5, w8 /* this is only hit by the backwards jump b.ne 30 at the end */ 34: 4b0500e4 sub w4, w7, w5 if (val & BIT_ULL(i)) 38: 9ac52528 lsr x8, x9, x5 new_val |= BIT_ULL(width - i - 1); 3c: f240011f tst x8, #0x1 for (i = 0; i < width; i++) { 40: 110004a8 add w8, w5, #0x1 new_val |= BIT_ULL(width - i - 1); 44: 9ac42144 lsl x4, x10, x4 48: aa0400c4 orr x4, x6, x4 4c: 9a861086 csel x6, x4, x6, ne // ne = any for (i = 0; i < width; i++) { 50: 6b0700bf cmp w5, w7 54: 54fffee1 b.ne 30 <adjust_for_msb_right_quirk+0x30> // b.any which indeed has no branch in the main loop (between 0x30 and 0x54), because as I explained, the "b 34" doesn't count - it's not in the loop. Additionally, this rewritten code which is considerably more obscure in C: static u64 bit_reverse(u64 val, unsigned int width) { u64 new_val = 0; unsigned int i; for (i = 0; i < width; i++) new_val |= !!(val & BIT_ULL(i)) ? BIT_ULL(width - i - 1) : 0; return new_val; } compiles to the exact same assembly code as the variant with "if": for (i = 0; i < width; i++) 1c: 310004eb adds w11, w7, #0x1 20: 540001c0 b.eq 58 <adjust_for_msb_right_quirk+0x58> // b.none 24: 52800005 mov w5, #0x0 // #0 new_val |= !!(val & BIT_ULL(i)) ? 28: d280002a mov x10, #0x1 // #1 2c: 14000002 b 34 <adjust_for_msb_right_quirk+0x34> 30: 2a0803e5 mov w5, w8 34: 4b0500e4 sub w4, w7, w5 38: 9ac52528 lsr x8, x9, x5 3c: f240011f tst x8, #0x1 for (i = 0; i < width; i++) 40: 110004a8 add w8, w5, #0x1 new_val |= !!(val & BIT_ULL(i)) ? 44: 9ac42144 lsl x4, x10, x4 48: aa0400c4 orr x4, x6, x4 4c: 9a861086 csel x6, x4, x6, ne // ne = any for (i = 0; i < width; i++) 50: 6b0700bf cmp w5, w7 54: 54fffee1 b.ne 30 <adjust_for_msb_right_quirk+0x30> // b.any So this is good with the "if".