RE: [PATCH 04/18] csum_and_copy_..._user(): pass 0xffffffff instead of 0 as initial sum

David Laight <David.Laight@xxxxxxxxxx> · Thu, 23 Jul 2020 13:54:47 +0000

From: Al Viro
> Sent: 22 July 2020 18:39
> I would love to see your patch, anyway, along with the testcases and performance
> comparison.

See attached program.
Compile and run (as root): csum_iov 1

Unpatched (as shipped) 16 vectors of 1 byte take ~430 clocks on my haswell cpu.
With dsl_patch defined they take ~393.

The maximum throughput is ~1.16 clocks/word for 16 vectors of 1k.
For longer vectors the data gets lost from the cache between the iterations.

On an older Ivy Bridge cpu it never goes faster than 2 clocks/word.
(Due to the implementation of ADC.)

The absolute limit is 1 clock/word - limited by the memory write.
I suspect that is achievable on Haswell with much less loop unrolling.

I had to replace the ror32() with __builtin_bswap32().
The kernel object does contain the 'ror' instruction - even though I
didn't find the asm for it.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
/* Test program for checksum+copy
 *
 * Executes csum_and_copy_from_iter() in userspace.
 * Uses PERF_COUNT_HW_CPU_CYCLES to see how fast it runs.
 * Always copies 16 copies of the same buffer to the target.
 * Length of each fragment taken from argv[1].
 *
 * It needs linking with a copy of csum-copy_64.o (eg from a kernel build).
 *
 * For large buffers the 'adc' loop dominates.
 * On anything prior to Haswell this is 2 clocks per adc.
 * On Haswell adc is faster and it seems to approach 1.16 clocks/word.
 * It ought to be possible to get to 1 clock/word on Ivy bridge (Sandy?)
 * or later.
 */
// define for my version
// #define dsl_patch

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

/*
 * No-op versions of the likely()/unlikely() annotations used by the
 * iterate macros below: each simply expands to its argument, so no
 * branch-prediction hint is given to the compiler.
 */
#define likely(x) (x)
#define unlikely(x) (x)

/* 32-bit checksum accumulator type used by the checksum helpers in this file. */
typedef uint32_t __wsum;

/* One contiguous memory segment of an iterator. */
struct kvec {
	size_t iov_len;		/* number of bytes in the segment */
	void   *iov_base;	/* address of the first byte of the segment */
};

/* Cut-down iterator over an array of struct kvec segments. */
struct iov_iter {
	unsigned int count;		/* bytes still to be consumed */
	unsigned int nr_segs;		/* segments remaining, starting at kvec */
	const struct kvec *kvec;	/* current segment */
	size_t       iov_offset;	/* bytes of the current segment already consumed */
};

/*
 * min() - the smaller of two values.
 *
 * Each argument is evaluated exactly once, so arguments with side effects
 * (e.g. min(i++, n)) behave as expected.  The comparison and the result
 * follow the usual arithmetic conversions of the two operand types.
 * Uses the GNU statement-expression and __typeof__ extensions.
 */
#define min(a, b) ({				\
	__typeof__(a) min_a_ = (a);		\
	__typeof__(b) min_b_ = (b);		\
	min_a_ < min_b_ ? min_a_ : min_b_;	\
})

/*
 * Reduce a 32-bit checksum accumulator to 16 bits by adding its upper half
 * into its lower half and wrapping any resulting carry back in.
 * The result is not inverted.
 */
static unsigned short fold(unsigned int csum)
{
	unsigned int low_half = csum & 0xffff;
	unsigned int high_half = csum >> 16;
	unsigned int folded = low_half + high_half;	/* may carry into bit 16 */

	folded += folded >> 16;		/* add that carry back into the low bits */

	/* Bits above 15 are dropped by the conversion to the return type. */
	return (unsigned short)folded;
}

/*
 * Combined checksum-and-copy routine.  It is not defined in this file;
 * presumably it comes from the csum-copy_64.o object this program has to
 * be linked with (confirm against the kernel build the object came from).
 */
extern __wsum csum_partial_copy_generic(const void *, void *, size_t, __wsum, void *, void *);

/*
 * Copy len bytes from src to dst while checksumming them, starting from
 * the initial value sum.  The last two arguments of the generic routine
 * are not used by this caller and are passed as NULL.
 */
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
{
	__wsum result;

	result = csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
	return result;
}

/*
 * Add two 32-bit values and add the carry out of that addition back into
 * the result (end-around carry): "addl" performs the addition and
 * "adcl $0" then adds the carry flag it produced.
 * x86-only inline assembly.
 */
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
	asm("addl %2,%0\n\t"
	    "adcl $0,%0"
	    : "=r" (a)
	    : "0" (a), "rm" (b));	/* "0": a is both input and output operand */
	return a;
}

/* Accumulate addend into csum using 32-bit addition with end-around carry. */
static inline __wsum csum_add(__wsum csum, __wsum addend)
{
	__wsum total = add32_with_carry(csum, addend);

	return total;
}

/*
 * Add the checksum of a block (sum) into the running checksum (csum).
 *
 * offset is the byte position of the block within the overall data.  When
 * it is odd, sum is byte-swapped first so that it lines up with the 16-bit
 * word boundaries of the running checksum.
 */
static inline __wsum
csum_block_add(__wsum csum, __wsum sum, int offset)
{
	__wsum aligned = (offset & 1) ? __builtin_bswap32(sum) : sum;

	return csum_add(csum, aligned);
}
//////////////////////////////////////////////////////////////////////

/* Necessary bits from iov_iter.c */

/*
 * iterate_kvec() - walk n bytes of the kvec array of iterator i.
 *
 * i:    iterator; i->kvec is the first segment visited
 * n:    lvalue byte count; decremented while walking and restored to its
 *       original value before the macro finishes
 * __v:  struct kvec lvalue filled in with each non-empty piece before STEP
 *       is evaluated
 * __p:  segment cursor; left pointing at the last segment visited
 * skip: lvalue; on entry the number of bytes to skip in the first segment,
 *       on exit the offset just past the last piece within *__p
 * STEP: expression evaluated once per piece, its value is discarded
 *
 * Zero-length pieces are skipped without evaluating STEP.  The walk does
 * not check how many segments exist: the caller must ensure that n bytes
 * are actually available.
 */
#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
	size_t wanted = n;				\
	__p = i->kvec;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		(void)(STEP);				\
		skip += __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	while (unlikely(n)) {				\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		(void)(STEP);				\
		skip = __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	n = wanted;					\
}

/*
 * iterate_and_advance() - evaluate K over the next n bytes of iterator i,
 * then advance i past them.
 *
 * n is first clamped to i->count.  v is the name of the struct kvec
 * declared inside the macro; K may refer to it to get at the current
 * piece.  The I and B arguments are not used by this kvec-only version.
 *
 * After the walk, a segment that has been consumed completely is stepped
 * over, and i->kvec, i->nr_segs, i->count and i->iov_offset are updated to
 * describe the first unconsumed byte.  Nothing happens when i->count is 0.
 */
#define iterate_and_advance(i, n, v, I, B, K) {			\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (i->count) {						\
		size_t skip = i->iov_offset;			\
			const struct kvec *kvec;		\
			struct kvec v;				\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
			if (skip == kvec->iov_len) {		\
				kvec++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		i->count -= n;					\
		i->iov_offset = skip;				\
	}							\
}

/*
 * Copy len bytes from 'from' to 'to' and return the running checksum
 * updated with those bytes.
 *
 * sum is the running checksum so far and off the byte offset of this
 * fragment within the overall data.  Two variants are selected at compile
 * time by dsl_patch:
 *  - not defined: the fragment is summed starting from 0 and the result is
 *    merged into sum with csum_block_add(), which accounts for odd offsets;
 *  - defined: sum is passed straight in as the initial value and off is
 *    not used here.
 */
static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
#ifndef dsl_patch
	__wsum partial = csum_partial_copy_nocheck(from, to, len, 0);

	return csum_block_add(sum, partial, off);
#else
	return csum_partial_copy_nocheck(from, to, len, sum);
#endif
}

/*
 * Byte-swap sum when len is odd - but only in the dsl_patch build; in the
 * unpatched build sum is returned unchanged.
 *
 * Keeping the conditional compilation in this helper means no preprocessor
 * directive has to appear inside the argument list of iterate_and_advance(),
 * which ISO C leaves undefined (it only works as a GNU CPP extension).
 */
static inline __wsum csum_swap_if_odd(__wsum sum, size_t len)
{
#ifdef dsl_patch
	if (len & 1)
		sum = __builtin_bswap32(sum);
#else
	(void)len;
#endif
	return sum;
}

/*
 * csum_and_copy_from_iter() - copy data out of an iterator while
 * checksumming it.
 *
 * addr:  destination buffer
 * bytes: number of bytes requested (clamped to what the iterator holds)
 * csum:  in: initial checksum, out: checksum including the copied data
 * i:     source iterator; advanced past the copied bytes
 *
 * Returns the number of bytes copied.
 *
 * In the dsl_patch build the running sum is fed into each fragment as its
 * initial value and byte-swapped after every odd-length fragment; a final
 * swap when the total length is odd undoes the net effect of those swaps.
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum = *csum;
	size_t off = 0;

	iterate_and_advance(i, bytes, v, , , ({
		sum = csum_and_memcpy(to, v.iov_base, v.iov_len, sum, off);
		sum = csum_swap_if_odd(sum, v.iov_len);
		to += v.iov_len;
		off += v.iov_len;
	}))

	sum = csum_swap_if_odd(sum, off);
	*csum = sum;
	return bytes;
}

//////////////////////////////////////////////////////////////////////

/*
 * Empty definitions of ex_handler_uaccess() and ex_handler_default().
 * Nothing in this file calls them; presumably they exist only to satisfy
 * symbol references from the linked-in csum-copy_64.o (TODO confirm).
 */
void ex_handler_uaccess(void)
{
}

void ex_handler_default(void)
{
}

/*
 * Source buffer that gets checksummed and copied.  Only the first 256
 * bytes are given explicit values; the rest of the 64 KiB array is
 * zero-initialised.
 */
static char data[65536] = {

0x46,0x56,0x20,0x04,0x00,0x02,0x00,0x00,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
0xd4,0x00,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,

};

/*
 * Disabled (#if 0) duplicate of the struct kvec / struct iov_iter
 * definitions near the top of the file; it is never compiled.
 */
#if 0
struct kvec {
	size_t iov_len;
	void   *iov_base;
};

struct iov_iter {
	unsigned int count;
	unsigned int nr_segs;
	const struct kvec *kvec;
	size_t       iov_offset;
};
#endif

/*
 * Execute the x86 "rdpmc" instruction for the given counter index and
 * return the low 32 bits of the value read.  The high half is read into
 * 'high' but not used.
 */
static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	// return low bits, counter might be 32 or 40 bits wide.
	return low;
}

/*
 * read_cpu_cycles() - return the low 32 bits of a CPU-cycle counter.
 *
 * The first call opens a pinned PERF_COUNT_HW_CPU_CYCLES perf event and
 * maps its control page read-only; the mapping is kept for the lifetime of
 * the process.  If either step fails a message is printed to stderr and
 * the program exits with status 1.
 *
 * Every call then reads the hardware counter directly with rdpmc and adds
 * the offset published in the control page.  Returns 0 if the control page
 * reports no counter index (pc->index == 0).
 *
 * Only differences between two return values are meaningful; unsigned
 * subtraction handles wrap-around of the 32-bit result.
 */
unsigned int read_cpu_cycles(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		/* perf_event_open(2) asks for the structure size to be filled in. */
		.size = sizeof(struct perf_event_attr),
		.config = PERF_COUNT_HW_CPU_CYCLES,
		// .config = PERF_COUNT_HW_INSTRUCTIONS,
		.pinned = 1,
	};
	static struct perf_event_mmap_page *pc;
	unsigned int seq, idx, count;

	if (!pc) {
		int perf_fd;

		/* Arguments after the attr: pid 0, cpu -1, group_fd -1, flags 0. */
		perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
		if (perf_fd < 0) {
			fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
			exit(1);
		}
		pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
		if (pc == MAP_FAILED) {
			fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
			exit(1);
		}
	}

	/*
	 * Sample index and offset, and retry if pc->lock changed while doing
	 * so.  The empty asm statements are compiler barriers that keep the
	 * reads between the two accesses to pc->lock.
	 */
	do {
		seq = pc->lock;
		asm volatile("":::"memory");
		idx = pc->index;
		if (!idx) //  || !pc->cap_user_rdpmc)
			return 0;
		/* rdpmc takes the index minus one. */
		count = pc->offset + rdpmc(idx - 1);
		asm volatile("":::"memory");
	} while (pc->lock != seq);

	return count;
}

/* Number of fragments (kvec entries) checksummed and copied per pass. */
#define NR_FRAGS 16

/* Destination buffer: room for NR_FRAGS copies of the whole of data[]. */
static int target[NR_FRAGS * sizeof data / sizeof(int)];

#define PASSES 16
/*
 * Benchmark driver.
 *
 * argv[1] optionally gives the length in bytes of each fragment; a missing,
 * zero or too large value selects sizeof data.  Each pass builds an
 * iterator of NR_FRAGS fragments that all point at data[], runs
 * csum_and_copy_from_iter() over it into target[], and records the cycle
 * counter.  Slot 0 of clocks[] holds the baseline reading, so passes are
 * numbered from 1 and csum[0] is unused.
 *
 * The interval reported for a pass includes setting up the iterator and
 * kvec array; the one for pass 1 also includes the argument parsing.
 */
int main(int argc, char **argv)
{
	struct kvec kvec[NR_FRAGS];
	struct iov_iter i;
	unsigned int clocks[PASSES];
	__wsum csum[PASSES] = { 0 };
	unsigned int pass;
	unsigned int seg;
	unsigned int frag_len;

	/* The first call sets up the perf event; the second is the baseline. */
	read_cpu_cycles();
	clocks[0] = read_cpu_cycles();

	frag_len = argc > 1 ? atoi(argv[1]) : 0;
	if (!frag_len || frag_len > sizeof data)
		frag_len = sizeof data;

	for (pass = 1; pass < PASSES; pass++) {
		/* Sum the same data NR_FRAGS times */
		i.count = frag_len * NR_FRAGS;
		i.nr_segs = NR_FRAGS;
		i.kvec = kvec;
		i.iov_offset = 0;

		for (seg = 0; seg < NR_FRAGS; seg++) {
			kvec[seg].iov_len = frag_len;
			kvec[seg].iov_base = data;
		}
		csum_and_copy_from_iter(target, i.count, csum + pass, &i);
		clocks[pass] = read_cpu_cycles();
	}

	for (pass = 1; pass < PASSES; pass++) {
		unsigned int delta = clocks[pass] - clocks[pass - 1];
		unsigned int total_len = frag_len * NR_FRAGS;
		unsigned int words = total_len / 8;	/* 8-byte words */

		printf("pass %u: length %u, csum %x, clocks %u, clocks/word %5f\n",
			pass, total_len, (unsigned int)fold(csum[pass]), delta,
			delta / (double)words);
	}

	return 0;
}