Re: [PATCH i-g-t 3/3] benchmarks/gem_syslatency: Specify batch duration

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Tue, 22 May 2018 12:49:52 +0100

On 22/05/2018 12:00, Chris Wilson wrote:
While for stressing the system we want to submit as many batches as we
can as that shows us worst case impact on system latency, it is not a
very realistic case. To introduce a bit more realism, allow the batches
to run for a user-defined duration.
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  benchmarks/gem_syslatency.c | 71 ++++++++++++++++++++++++++++++++++---
  1 file changed, 67 insertions(+), 4 deletions(-)

diff --git a/benchmarks/gem_syslatency.c b/benchmarks/gem_syslatency.c
index d1056773a..45cabe86c 100644
--- a/benchmarks/gem_syslatency.c
+++ b/benchmarks/gem_syslatency.c
@@ -51,6 +51,7 @@ static volatile int done;
  
  struct gem_busyspin {
  	pthread_t thread;
+	unsigned long sz;
  	unsigned long count;
  	bool leak;
  	bool interrupts;
@@ -96,7 +97,8 @@ static void *gem_busyspin(void *arg)
  	struct gem_busyspin *bs = arg;
  	struct drm_i915_gem_execbuffer2 execbuf;
  	struct drm_i915_gem_exec_object2 obj[2];
-	const unsigned sz = bs->leak ? 16 << 20 : 4 << 10;
+	const unsigned sz =
+		bs->sz ? bs->sz + sizeof(bbe) : bs->leak ? 16 << 20 : 4 << 10;
  	unsigned engines[16];
  	unsigned nengine;
  	unsigned engine;
@@ -112,7 +114,7 @@ static void *gem_busyspin(void *arg)
  	obj[0].handle = gem_create(fd, 4096);
  	obj[0].flags = EXEC_OBJECT_WRITE;
  	obj[1].handle = gem_create(fd, sz);
-	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+	gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));

Hm, what was the point of creating large batches here if the bbe
(MI_BATCH_BUFFER_END) was always written first, at offset 0?

  
  	memset(&execbuf, 0, sizeof(execbuf));
  	execbuf.buffers_ptr = (uintptr_t)(obj + !bs->interrupts);
@@ -125,6 +127,12 @@ static void *gem_busyspin(void *arg)
  	}
  
  	while (!done) {
+		for (int n = 0; n < nengine; n++) {
+			const int m = rand() % nengine;
+			unsigned int tmp = engines[n];
+			engines[n] = engines[m];
+			engines[m] = tmp;

igt_exchange_int? The problem with frameworks getting more featureful is
that it is easier to forget what is there. :) Or even igt_permute_array?

But what does it have to do with batch duration?

+		}
  		for (int n = 0; n < nengine; n++) {
  			execbuf.flags &= ~ENGINE_FLAGS;
  			execbuf.flags |= engines[n];
@@ -134,7 +142,7 @@ static void *gem_busyspin(void *arg)
  		if (bs->leak) {
  			gem_madvise(fd, obj[1].handle, I915_MADV_DONTNEED);
  			obj[1].handle = gem_create(fd, sz);
-			gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+			gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));
  		}
  	}
  
@@ -294,6 +302,50 @@ static void *background_fs(void *path)
  	return NULL;
  }
  
+static unsigned long calibrate_nop(unsigned int target_us,
+				   unsigned int tolerance_pct)
+{
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+	const unsigned int loops = 100;
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	struct timespec t_0, t_end;
+	long sz, prev;
+	int fd;
+
+	fd = drm_open_driver(DRIVER_INTEL);
+
+	clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+	sz = 256 * 1024;
+	do {
+		struct timespec t_start;
+
+		obj.handle = gem_create(fd, sz + sizeof(bbe));
+		gem_write(fd, obj.handle, sz, &bbe, sizeof(bbe));
+		gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+
+		clock_gettime(CLOCK_MONOTONIC, &t_start);
+		for (int loop = 0; loop < loops; loop++)
+			gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+		clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+		gem_close(fd, obj.handle);
+
+		prev = sz;
+		sz = loops * sz / elapsed(&t_start, &t_end) * 1e3 * target_us;
+		sz = ALIGN(sz, sizeof(uint32_t));
+	} while (elapsed(&t_0, &t_end) < 5 ||
+		 abs(sz - prev) > (sz * tolerance_pct / 100));
+
+	close(fd);
+
+	return sz;
+}

I presume this is a copy & paste, so I don't have to look into it in detail.

+
  int main(int argc, char **argv)
  {
  	struct gem_busyspin *busy;
@@ -309,9 +361,10 @@ int main(int argc, char **argv)
  	int enable_gem_sysbusy = 1;
  	bool leak = false;
  	bool interrupts = false;
+	long batch = 0;
  	int n, c;
  
-	while ((c = getopt(argc, argv, "t:f:bmni1")) != -1) {
+	while ((c = getopt(argc, argv, "r:t:f:bmni1")) != -1) {
  		switch (c) {
  		case '1':
  			ncpus = 1;
@@ -328,6 +381,10 @@ int main(int argc, char **argv)
  			if (time < 0)
  				time = INT_MAX;
  			break;
+		case 'r':
+			/* Duration of each batch (microseconds) */
+			batch = atoi(optarg);
+			break;
  		case 'f':
  			/* Select an output field */
  			field = atoi(optarg);
@@ -350,11 +407,17 @@ int main(int argc, char **argv)
  	force_low_latency();
  	min = min_measurement_error();
  
+	if (batch > 0)
+		batch = calibrate_nop(batch, 2);
+	else
+		batch = -batch;
+

I have no idea what the purpose of this is. Does the user pass in a
negative value on the command line? But then the calibration is missing.

  	busy = calloc(ncpus, sizeof(*busy));
  	pthread_attr_init(&attr);
  	if (enable_gem_sysbusy) {
  		for (n = 0; n < ncpus; n++) {
  			bind_cpu(&attr, n);
+			busy[n].sz = batch;
  			busy[n].leak = leak;
  			busy[n].interrupts = interrupts;
  			pthread_create(&busy[n].thread, &attr,


Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx