RE: [PATCH v2] x86: bring back rep movsq for user access on CPUs without ERMS

From: Mateusz Guzik
> Sent: 10 September 2023 11:54
> 
> On 9/3/23, David Laight <David.Laight@xxxxxxxxxx> wrote:
> > ...
> >> When I was playing with this stuff about 5 years ago I found 32-byte
> >> loops to be optimal for uarchs of the period (Skylake, Broadwell,
> >> Haswell and so on), but only up to a point where rep wins.
> >
> > Does the 'rep movsq' ever actually win?
> > (Unless you find one of the ERMS (or similar) versions.)
> > IIRC it only ever does one iteration per clock - and you
> > should be able to match that with a carefully constructed loop.
> >
> 
> Sorry for the late reply, I missed your e-mail due to all the unrelated
> traffic in the thread and using the gmail client. ;)
> 
> I am somewhat confused by the question though. In this very patch I'm
> showing numbers from an ERMS-less uarch getting a win from switching
> from hand-rolled mov loop to rep movsq, while doing 4KB copies.

I've just done some measurements on an i7-7700.
That does have ERMS (fast 'rep movsb') but shows some interesting info.

The overhead of 'rep movsb' is about 36 clocks, 'rep movsq' only 16
(although the measured overhead varies a little from run to run).
'rep movsb' will copy (about) 32 bytes/clock provided the
destination buffer is 32-byte aligned, but only 16 bytes/clock
otherwise. The source buffer alignment doesn't seem to matter.

On this system 'rep movsq' seems to behave the same way.

So that is faster than a copy loop, which is limited to one
register write per clock.
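
For reference, the sort of unrolled loop being compared against might
look like the below (an untested sketch, assuming len is a non-zero
multiple of 32; it could be dropped into memcpy_perf in the attached
program as another MODE). Four 8-byte stores per iteration cap it at
about 8 bytes/clock if only one register write retires per clock:

	asm volatile("1:\n"
		"	movq	  (%%rsi), %%rax\n"
		"	movq	 8(%%rsi), %%r8\n"
		"	movq	16(%%rsi), %%r9\n"
		"	movq	24(%%rsi), %%r10\n"
		"	movq	%%rax,	  (%%rdi)\n"
		"	movq	%%r8,	 8(%%rdi)\n"
		"	movq	%%r9,	16(%%rdi)\n"
		"	movq	%%r10,	24(%%rdi)\n"
		"	addq	$32, %%rsi\n"
		"	addq	$32, %%rdi\n"
		"	subq	$32, %%rcx\n"
		"	jnz	1b\n"
		: "+&D" (d_buff), "+&S" (s_buff), "+&c" (len)
		: : "rax", "r8", "r9", "r10", "memory");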

Test program attached.
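
To build and run it, something along these lines works (the file and
binary names here are just examples):

	gcc -O2 -DMODE=0 -o copy_movsb copy_test.c
	./copy_movsb -l 4096 -d 1 -o 36

MODE -1 is the no-copy baseline, 0 uses 'rep movsb' and 1 'rep movsq'.
-l sets the copy length, -s/-d offset the source/destination buffers
(handy for the alignment tests above), and -o is the overhead value
subtracted from the tick counts when the rate column is printed.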

	David

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>

static int init_pmc(void)
{
	static struct perf_event_attr perf_attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.pinned = 1,
	};
	struct perf_event_mmap_page *pc;

	int perf_fd;
	perf_fd = syscall(__NR_perf_event_open, &perf_attr, 0, -1, -1, 0);
	if (perf_fd < 0) {
		fprintf(stderr, "perf_event_open failed: errno %d\n", errno);
		exit(1);
	}
	pc = mmap(NULL, 4096, PROT_READ, MAP_SHARED, perf_fd, 0);
	if (pc == MAP_FAILED) {
		fprintf(stderr, "perf_event mmap() failed: errno %d\n", errno);
		exit(1);
	}
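	// The mmap page's 'index' field is the rdpmc counter number + 1;
	// 0 would mean rdpmc isn't available (not checked here).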
	return pc->index - 1;
}

static inline unsigned int rdpmc(unsigned int counter)
{
	unsigned int low, high;

	// asm volatile("rdtsc" : "=a" (low), "=d" (high));
	// The lfence stops rdpmc executing before earlier instructions finish.
	asm volatile("lfence");
	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	// Return the low bits; the counter might be 32 or 40 bits wide.
	return low;
}

#ifndef MODE
#define MODE 0
#endif

__attribute__((noinline))
void memcpy_perf(unsigned char *d_buff, const unsigned char *s_buff, unsigned long len)
{

#if MODE == -1
// 'No copy' loop for baseline overhead
	asm volatile("	nop\n"
		: "+&D" (d_buff),  "+&S" (s_buff),  "+&c" (len)
		: : "memory");
#endif

#if MODE == 0
// Simple 'rep movsb' copy
	asm volatile("	rep movsb\n"
		: "+&D" (d_buff),  "+&S" (s_buff),  "+&c" (len)
		: : "memory");
#endif

#if MODE == 1
// Simple 'rep movsq' copy; assumes len is a multiple of 8
	len /= 8;
	asm volatile("	rep movsq\n"
		: "+&D" (d_buff),  "+&S" (s_buff),  "+&c" (len)
		: : "memory");
#endif

}

unsigned char s_buff[8192] __attribute__((aligned(4096)));
unsigned char d_buff[8192 + 1] __attribute__((aligned(4096)));

#ifndef PASSES
#define PASSES 5
#endif

#ifndef OFFSET
#define OFFSET 0
#endif

int main(int argc, char **argv)
{
	unsigned int tick;
	unsigned int ticks[PASSES];
	unsigned int len, s_off = 0, d_off = 0;
	unsigned int i;
	unsigned int id = init_pmc();
	unsigned int offset = OFFSET;

	len = sizeof s_buff;
	for (;;) {
		switch (getopt(argc, argv, "l:s:d:o:")) {
		case -1:
			break;
		default:
			exit(1);
		case 'l': len = atoi(optarg); continue;
		case 's': s_off = atoi(optarg); continue;
		case 'd': d_off = atoi(optarg); continue;
		case 'o': offset = atoi(optarg); continue;
		}
		break;
	}

	if (s_off + len > sizeof s_buff || d_off + len > sizeof d_buff - 1) {
		fprintf(stderr, "too long\n");
		exit(1);
	}

	for (i = 0; i < len; i++)
		s_buff[i] = rand();

	for (i = 0; i < PASSES; i++) {
		tick = rdpmc(id);
		memcpy_perf(d_buff + d_off, s_buff + s_off, len);
		ticks[i] = rdpmc(id) - tick;
	}

	printf("   ticks    rate mode %d\n", MODE);
	for (i = 0; i < PASSES; i++)
		printf(" %7u %7u\n", ticks[i], 100 * len / (ticks[i] - offset));

	if (memcmp(d_buff + d_off, s_buff + s_off, len) || d_buff[d_off + len]) {
		fprintf(stderr, "copy mismatch\n");
		exit(1);
	}
	return 0;
}

