Attached user space program I used to see the difference. Usage: gcc -02 -o strscpy strscpy_test.c ./strscpy {b|w} src_str_len count src_str_len - length of source string in between 1-4096 count - how many strscpy() to execute. Also I've noticed something strange. I'm not sure why, but certain src_len values (e.g. 30) drives branch predictor crazy causing worse than usual results for byte-at-a-time copy: $ perf stat ./strscpy b 29 10000000 Performance counter stats for './strscpy b 29 10000000': 165.354974 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 48 page-faults:u # 0.290 K/sec 640,475,981 cycles:u # 3.873 GHz 2,500,090,080 instructions:u # 3.90 insn per cycle 640,017,126 branches:u # 3870.565 M/sec 1,589 branch-misses:u # 0.00% of all branches 0.165568346 seconds time elapsed Performance counter stats for './strscpy b 30 10000000': 250.835659 task-clock:u (msec) # 0.999 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 46 page-faults:u # 0.183 K/sec 974,528,780 cycles:u # 3.885 GHz 2,580,090,165 instructions:u # 2.65 insn per cycle 660,017,211 branches:u # 2631.273 M/sec 14,488,234 branch-misses:u # 2.20% of all branches 0.251147341 seconds time elapsed Performance counter stats for './strscpy b 31 10000000': 176.598368 task-clock:u (msec) # 0.997 CPUs utilized 0 context-switches:u # 0.000 K/sec 0 cpu-migrations:u # 0.000 K/sec 46 page-faults:u # 0.260 K/sec 681,367,948 cycles:u # 3.858 GHz 2,660,090,092 instructions:u # 3.90 insn per cycle 680,017,138 branches:u # 3850.642 M/sec 1,817 branch-misses:u # 0.00% of all branches 0.177150181 seconds time elapsed
#include <stdlib.h> #include <string.h> #define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) #define E2BIG -1 #define PAGE_SIZE 4096 struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } static inline long count_masked_bytes(unsigned long mask) { return mask*0x0001020304050608ul >> 56; } static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) static inline unsigned long find_zero(unsigned long mask) { return count_masked_bytes(mask); } __attribute__((noinline)) int strscpy_word(char *dest, const char *src, size_t count) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; size_t max = count; long res = 0; if (count == 0) return -E2BIG; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* * If src is unaligned, don't cross a page boundary, * since we don't know if the next page is mapped. */ if ((long)src & (sizeof(long) - 1)) { size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); if (limit < max) max = limit; } #else /* If src or dest is unaligned, don't do word-at-a-time. */ if (((long) dest | (long) src) & (sizeof(long) - 1)) max = 0; #endif while (max >= sizeof(unsigned long)) { unsigned long c, data; c = *(unsigned long *)(src+res); if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); *(unsigned long *)(dest+res) = c & zero_bytemask(data); return res + find_zero(data); } *(unsigned long *)(dest+res) = c; res += sizeof(unsigned long); count -= sizeof(unsigned long); max -= sizeof(unsigned long); } while (count) { char c; c = src[res]; dest[res] = c; if (!c) return res; res++; count--; } /* Hit buffer length without finding a NUL; force NUL-termination. */ if (res) dest[res-1] = '\0'; return -E2BIG; } __attribute__((noinline)) int strscpy_byte(char *dest, const char *src, int count) { int res = 0; while (count) { char c; c = src[res]; dest[res] = c; if (!c) return res; res++; count--; } /* Hit buffer length without finding a NUL; force NUL-termination. */ if (res) dest[res-1] = '\0'; return -E2BIG; } char dest[4096] __attribute__((aligned(4096))); char src[4096] __attribute__((aligned(4096))); int main(int argc, char **argv) { unsigned long long i; unsigned long src_len; unsigned long count; if (argc < 4) return -1; src_len = atoi(argv[2]); count = atoi(argv[3]); memset(src, 1, src_len); if (argv[1][0] == 'w') { for (i = 0; i < count; i++) { strscpy_word(dest, src, sizeof(dest)); } } else if (argv[1][0] == 'b') { for (i = 0; i < count; i++) { strscpy_byte(dest, src, sizeof(dest)); } } return 0; }