Re: [PATCH] lib/strscpy: remove word-at-a-time optimization.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Attached user space program I used to see the difference.
Usage:
	gcc -02 -o strscpy strscpy_test.c
	./strscpy {b|w} src_str_len count

src_str_len - length of source string in between 1-4096
count - how many strscpy() to execute.



Also I've noticed something strange. I'm not sure why, but certain
src_len values (e.g. 30) drives branch predictor crazy causing worse than usual results
for byte-at-a-time copy:

$ perf stat   ./strscpy b 29 10000000

 Performance counter stats for './strscpy b 29 10000000':

        165.354974      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                48      page-faults:u             #    0.290 K/sec                  
       640,475,981      cycles:u                  #    3.873 GHz                    
     2,500,090,080      instructions:u            #    3.90  insn per cycle         
       640,017,126      branches:u                # 3870.565 M/sec                  
             1,589      branch-misses:u           #    0.00% of all branches        

       0.165568346 seconds time elapsed



 Performance counter stats for './strscpy b 30 10000000':

        250.835659      task-clock:u (msec)       #    0.999 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                46      page-faults:u             #    0.183 K/sec                  
       974,528,780      cycles:u                  #    3.885 GHz                    
     2,580,090,165      instructions:u            #    2.65  insn per cycle         
       660,017,211      branches:u                # 2631.273 M/sec                  
        14,488,234      branch-misses:u           #    2.20% of all branches        

       0.251147341 seconds time elapsed


 Performance counter stats for './strscpy b 31 10000000':

        176.598368      task-clock:u (msec)       #    0.997 CPUs utilized          
                 0      context-switches:u        #    0.000 K/sec                  
                 0      cpu-migrations:u          #    0.000 K/sec                  
                46      page-faults:u             #    0.260 K/sec                  
       681,367,948      cycles:u                  #    3.858 GHz                    
     2,660,090,092      instructions:u            #    3.90  insn per cycle         
       680,017,138      branches:u                # 3850.642 M/sec                  
             1,817      branch-misses:u           #    0.00% of all branches        

       0.177150181 seconds time elapsed

#include <stdlib.h>
#include <string.h>

#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

#define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))

#define E2BIG -1
#define PAGE_SIZE 4096
struct word_at_a_time {
	const unsigned long one_bits, high_bits;
};

#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
static inline long count_masked_bytes(unsigned long mask)
{
	return mask*0x0001020304050608ul >> 56;
}
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
	unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
	*bits = mask;
	return mask;
}

static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
	return bits;
}

static inline unsigned long create_zero_mask(unsigned long bits)
{
	bits = (bits - 1) & ~bits;
	return bits >> 7;
}

/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)

static inline unsigned long find_zero(unsigned long mask)
{
	return count_masked_bytes(mask);
}

__attribute__((noinline))
int strscpy_word(char *dest, const char *src, size_t count)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	size_t max = count;
	long res = 0;

	if (count == 0)
		return -E2BIG;

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	/*
	 * If src is unaligned, don't cross a page boundary,
	 * since we don't know if the next page is mapped.
	 */
	if ((long)src & (sizeof(long) - 1)) {
		size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
		if (limit < max)
			max = limit;
	}
#else
	/* If src or dest is unaligned, don't do word-at-a-time. */
	if (((long) dest | (long) src) & (sizeof(long) - 1))
		max = 0;
#endif

	while (max >= sizeof(unsigned long)) {
		unsigned long c, data;

		c = *(unsigned long *)(src+res);
		if (has_zero(c, &data, &constants)) {
			data = prep_zero_mask(c, data, &constants);
			data = create_zero_mask(data);
			*(unsigned long *)(dest+res) = c & zero_bytemask(data);
			return res + find_zero(data);
		}
		*(unsigned long *)(dest+res) = c;
		res += sizeof(unsigned long);
		count -= sizeof(unsigned long);
		max -= sizeof(unsigned long);
	}

	while (count) {
		char c;

		c = src[res];
		dest[res] = c;
		if (!c)
			return res;
		res++;
		count--;
	}

	/* Hit buffer length without finding a NUL; force NUL-termination. */
	if (res)
		dest[res-1] = '\0';

	return -E2BIG;
}

__attribute__((noinline))
int strscpy_byte(char *dest, const char *src, int count)
{
	int res = 0;

	while (count) {
		char c;
		c = src[res];
		dest[res] = c;
		if (!c)
			return res;
		res++;
		count--;

	}

	/* Hit buffer length without finding a NUL; force NUL-termination. */
	if (res)
		dest[res-1] = '\0';

	return -E2BIG;
}

char dest[4096] __attribute__((aligned(4096)));
char src[4096] __attribute__((aligned(4096)));

int main(int argc, char **argv)
{
	unsigned long long i;
	unsigned long src_len;
	unsigned long count;

	if (argc < 4)
		return -1;

	src_len = atoi(argv[2]);
	count = atoi(argv[3]);

	memset(src, 1, src_len);

	if (argv[1][0] == 'w') {
		for (i = 0; i < count; i++) {
			strscpy_word(dest, src, sizeof(dest));
		}
	} else if (argv[1][0] == 'b') {
		for (i = 0; i < count; i++) {
			strscpy_byte(dest, src, sizeof(dest));
		}
	}
	return 0;
}

[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]