From: Al Viro
> Sent: 22 July 2020 18:39
> I would love to see your patch, anyway, along with the testcases and performance
> comparison.

See attached program.
Compile and run (as root): csum_iov 1

Unpatched (as shipped) 16 vectors of 1 byte take ~430 clocks on my Haswell cpu.
With dsl_patch defined they take ~393.

The maximum throughput is ~1.16 clocks/word for 16 vectors of 1k.
For longer vectors the data gets lost from the cache between the iterations.

On an older Ivy Bridge cpu it never goes faster than 2 clocks/word
(due to the implementation of ADC).

The absolute limit is 1 clock/word - limited by the memory write.
I suspect that is achievable on Haswell with much less loop unrolling.

I had to replace the ror32() with __builtin_bswap32().
The kernel objects do contain the 'ror' instruction - even though I didn't
find the asm for it.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
/* Test program for checksum+copy
 *
 * Executes csum_and_copy_from_iter() in userspace.
 * Uses PERF_COUNT_HW_CPU_CYCLES to see how fast it runs.
 * Always copies 16 copies of the same buffer to the target.
 * Length of each fragment taken from argv[1].
 *
 * It needs linking with a copy of csum-copy_64.o (eg from a kernel build).
 *
 * For large buffers the 'adc' loop dominates.
 * On anything prior to Haswell this is 2 clocks per adc.
 * On Haswell adc is faster and it seems to approach 1.16 clocks/word.
 * It ought to be possible to get to 1 clock/word on Ivy bridge (Sandy?)
 * or later.
 */

// define for my version
// #define dsl_patch

/* syscall() is only declared by <unistd.h> when a GNU/BSD feature macro is on. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

/* Userspace stand-ins for the kernel branch-prediction hints. */
#define likely(x) (x)
#define unlikely(x) (x)

typedef uint32_t __wsum;

/* Minimal copies of the kernel structures used by the iterator macros. */
struct kvec {
	size_t iov_len;
	void *iov_base;
};

struct iov_iter {
	unsigned int count;		/* bytes remaining in the iterator */
	unsigned int nr_segs;		/* segments remaining in kvec[] */
	const struct kvec *kvec;	/* current segment */
	size_t iov_offset;		/* offset into the current segment */
};

/* NOTE: evaluates each argument twice; only use with side-effect free operands. */
#define min(a,b) ((a) < (b) ? (a) : (b))

/* Number of fragments (kvec entries) summed per pass. */
#define NR_FRAGS 16

/* Number of entries in clocks[]/csum[]; pass 0 only holds the start time. */
#define PASSES 16

/*
 * Fold a 32-bit partial checksum down to 16 bits by adding the two halves
 * and then adding the carry back in.
 */
static unsigned short fold(unsigned int csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return csum + (csum >> 16);
}

/* Provided by the kernel object this program is linked against. */
extern __wsum csum_partial_copy_generic(const void *, void *, size_t, __wsum, void *, void *);

/*
 * Copy 'len' bytes from 'src' to 'dst' and return the checksum of the data
 * added to 'sum'. Both error pointers of the generic routine are passed as NULL.
 */
__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
{
	return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
}

/* 32-bit add with the carry out added back in (end-around carry). */
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
	__asm__("addl %2,%0\n\t"
		"adcl $0,%0"
		: "=r" (a)
		: "0" (a), "rm" (b));
	return a;
}

/* Add two partial checksums. */
static inline __wsum csum_add(__wsum csum, __wsum addend)
{
	return add32_with_carry(csum, addend);
}

/*
 * Add 'sum', the checksum of a block that starts 'offset' bytes into the
 * overall buffer, to the running checksum 'csum'.
 */
static inline __wsum csum_block_add(__wsum csum, __wsum sum, int offset)
{
	/* rotate sum to align it with a 16b boundary */
	if (offset & 1)
		sum = __builtin_bswap32(sum);
	return csum_add(csum, sum);
}

//////////////////////////////////////////////////////////////////////
/* Necessary bits from iov_iter.c */

/*
 * Walk 'n' bytes of the kvec array of iterator 'i', starting 'skip' bytes
 * into the current segment. For every non-empty chunk, __v is set to describe
 * it and STEP is evaluated. On exit __p points at the last segment visited,
 * 'skip' is the offset reached inside it and 'n' is restored to its entry value.
 */
#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
	size_t wanted = n; \
	__p = i->kvec; \
	__v.iov_len = min(n, __p->iov_len - skip); \
	if (likely(__v.iov_len)) { \
		__v.iov_base = __p->iov_base + skip; \
		(void)(STEP); \
		skip += __v.iov_len; \
		n -= __v.iov_len; \
	} \
	while (unlikely(n)) { \
		__p++; \
		__v.iov_len = min(n, __p->iov_len); \
		if (unlikely(!__v.iov_len)) \
			continue; \
		__v.iov_base = __p->iov_base; \
		(void)(STEP); \
		skip = __v.iov_len; \
		n -= __v.iov_len; \
	} \
	n = wanted; \
}

/*
 * Clamp 'n' to the bytes left in iterator 'i', run K over each chunk (see
 * iterate_kvec) and then advance the iterator past the bytes consumed.
 * 'n' is an lvalue and is modified when clamped. The I and B arguments are
 * unused in this kvec-only copy of the macro.
 */
#define iterate_and_advance(i, n, v, I, B, K) { \
	if (unlikely(i->count < n)) \
		n = i->count; \
	if (i->count) { \
		size_t skip = i->iov_offset; \
		const struct kvec *kvec; \
		struct kvec v; \
		iterate_kvec(i, n, v, kvec, skip, (K)) \
		if (skip == kvec->iov_len) { \
			kvec++; \
			skip = 0; \
		} \
		i->nr_segs -= kvec - i->kvec; \
		i->kvec = kvec; \
		i->count -= n; \
		i->iov_offset = skip; \
	} \
}

/*
 * Copy one fragment and fold its checksum into 'sum'.
 *
 * Unpatched: the fragment is summed from zero and then merged into 'sum'
 * with csum_block_add(), which accounts for an odd starting offset 'off'.
 * With dsl_patch: 'sum' is fed straight into the copy routine and the caller
 * handles the odd-offset byte swapping; 'off' is not used.
 */
static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
#ifdef dsl_patch
	(void)off;
	return csum_partial_copy_nocheck(from, to, len, sum);
#else
	__wsum next = csum_partial_copy_nocheck(from, to, len, 0);
	return csum_block_add(sum, next, off);
#endif
}

/*
 * Copy up to 'bytes' bytes described by iterator 'i' to 'addr', updating the
 * running checksum in '*csum'. The iterator is advanced past the copied data.
 *
 * Returns the number of bytes copied ('bytes' clamped to i->count).
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum;
	size_t off = 0;

	sum = *csum;
	iterate_and_advance(i, bytes, v, , ,({
		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len,
				      sum, off);
		off += v.iov_len;
#ifdef dsl_patch
		/* An odd-length fragment flips the byte alignment of what follows. */
		if (v.iov_len & 1)
			sum = __builtin_bswap32(sum);
#endif
	})
	)
#ifdef dsl_patch
	/* Undo the outstanding swap if the total length was odd. */
	if (off & 1)
		sum = __builtin_bswap32(sum);
#endif
	*csum = sum;
	return bytes;
}

//////////////////////////////////////////////////////////////////////

/*
 * Empty stubs. NOTE(review): presumably these satisfy symbol references from
 * the kernel object's exception-table entries - confirm against the object.
 */
void ex_handler_uaccess(void)
{
}

void ex_handler_default(void)
{
}

/* Source buffer for every fragment; bytes beyond the initialiser are zero. */
static unsigned char data[65536] = {
	0x46,0x56,0x20,0x04,0x00,0x02,0x00,0x00,0x72,0x4d,0xc6,0x3d,0x31,0x85,0x2d,0xbd,
	0xe2,0xe0,0x9d,0x3e,0x3b,0x7a,0x70,0x3d,0xd2,0xfb,0x8c,0xbf,0x95,0x10,0xa9,0xbe,
	0xeb,0xfd,0x29,0x40,0xd5,0x7a,0x61,0x40,0xde,0xcd,0x14,0xbf,0x81,0x1b,0xf6,0x3f,
	0xbc,0xff,0x17,0x3f,0x67,0x1c,0x6e,0xbe,0xf4,0xc2,0x05,0x40,0x0b,0x13,0x78,0x3f,
	0xfe,0x47,0xa7,0xbd,0x59,0xc2,0x15,0x3f,0x07,0xd0,0xea,0xbf,0x97,0xf1,0x3c,0x3f,
	0xcc,0xfa,0x6b,0x40,0x72,0x6a,0x4f,0xbe,0x0b,0xe3,0x75,0x3e,0x3c,0x9b,0x0e,0xbf,
	0xa9,0xeb,0xb7,0x3f,0xeb,0x4a,0xec,0x3e,0x33,0x8c,0x0c,0x3f,0x6a,0xf2,0xf3,0x3e,
	0x2b,0x45,0x86,0x3f,0x83,0xce,0x8a,0x3f,0xf6,0x01,0x16,0x40,0x9c,0x17,0x47,0x3e,
	0x44,0x83,0x61,0x40,0x74,0xc7,0x5c,0x3f,0xec,0xe7,0x95,0x3f,0xee,0x19,0xb5,0xbf,
	0xb5,0xf0,0x03,0xbf,0xd1,0x02,0x1c,0x3e,0xa3,0x55,0x90,0xbe,0x1e,0x0b,0xa1,0xbf,
	0xa4,0xa8,0xb4,0x3f,0xc6,0x68,0x91,0x3f,0xd1,0xc5,0xab,0x3f,0xb9,0x14,0x62,0x3f,
	0x7c,0xe0,0xb9,0xbf,0xc0,0xa4,0xb5,0x3d,0x6f,0xd9,0xa7,0x3f,0x8f,0xc4,0xb0,0x3d,
	0x48,0x2c,0x7a,0x3e,0x83,0xb2,0x3c,0x40,0x36,0xd3,0x18,0x40,0xb7,0xa9,0x57,0x40,
	0xda,0xd3,0x95,0x3f,0x74,0x95,0xc0,0xbe,0xbb,0xce,0x71,0x3e,0x95,0xec,0x18,0xbf,
	0x94,0x17,0xdd,0x3f,0x98,0xa5,0x02,0x3f,0xbb,0xfb,0xbb,0x3e,0xd0,0x5a,0x9c,0x3f,
	0xd4,0x00,0x9b,0xbf,0x3b,0x9f,0x20,0xc0,0x84,0x5b,0x0f,0x40,0x5e,0x48,0x2c,0xbf,
};

/* Execute the x86 'rdpmc' instruction for the given counter index. */
static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

	__asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	// return low bits, counter might be 32 or 40 bits wide.
	(void)high;
	return low;
}

/*
 * Return the low 32 bits of the CPU cycle count for this process.
 *
 * The first call opens a pinned PERF_COUNT_HW_CPU_CYCLES event and maps its
 * control page; the process exits if either step fails. Returns 0 when the
 * mapped page reports no counter index.
 */
unsigned int read_cpu_cycles(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		// .config = PERF_COUNT_HW_INSTRUCTIONS,
		.pinned = 1,
	};
	static struct perf_event_mmap_page *pc;

	unsigned int seq, idx, count;

	if (!pc) {
		int perf_fd;

		perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
		if (perf_fd < 0) {
			fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
			exit(1);
		}
		pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
		if (pc == MAP_FAILED) {
			fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
			exit(1);
		}
	}

	/* Retry if pc->lock changed while the index/offset were being read. */
	do {
		seq = pc->lock;
		__asm__ volatile("":::"memory");
		idx = pc->index;
		if (!idx) // || !pc->cap_user_rdpmc)
			return 0;
		count = pc->offset + rdpmc(idx - 1);
		__asm__ volatile("":::"memory");
	} while (pc->lock != seq);

	return count;
}

/* Destination buffer: room for NR_FRAGS fragments of sizeof data bytes each. */
static int target[NR_FRAGS * sizeof data / sizeof(int)];

/*
 * Run PASSES - 1 timed passes of csum_and_copy_from_iter() over NR_FRAGS
 * fragments of 'data' and print the checksum and cycle count of each pass.
 *
 * argv[1] gives the fragment length in bytes; a missing, zero or too large
 * value selects sizeof data.
 */
int main(int argc, char **argv)
{
	struct kvec vecs[NR_FRAGS];
	struct iov_iter i;
	unsigned int seg;
	unsigned int clocks[PASSES];
	__wsum csum[PASSES] = {0};
	unsigned int pass;
	unsigned int frag_len;

	/* The first call sets up the perf event; its result is discarded. */
	read_cpu_cycles();
	clocks[0] = read_cpu_cycles();

	frag_len = argc > 1 ? atoi(argv[1]) : 0;
	if (!frag_len || frag_len > sizeof data)
		frag_len = sizeof data;

	for (pass = 1; pass < PASSES; pass++) {
		/* Sum the same data 16 times */
		i.count = frag_len * NR_FRAGS;
		i.nr_segs = NR_FRAGS;
		i.kvec = vecs;
		i.iov_offset = 0;
		for (seg = 0; seg < NR_FRAGS; seg++) {
			vecs[seg].iov_len = frag_len;
			vecs[seg].iov_base = data;
		}
		csum_and_copy_from_iter(target, i.count, csum + pass, &i);
		clocks[pass] = read_cpu_cycles();
	}

	for (pass = 1; pass < PASSES; pass++) {
		/* Unsigned subtraction copes with the 32-bit counter wrapping. */
		unsigned int delta = clocks[pass] - clocks[pass - 1];

		printf("pass %u: length %u, csum %x, clocks %u, clocks/word %5f\n",
			pass, frag_len * NR_FRAGS, (unsigned int)fold(csum[pass]), delta,
			delta / (frag_len * NR_FRAGS / 8 + 0.0));
	}

	return 0;
}