On 22/05/2018 12:00, Chris Wilson wrote:
While for stressing the system we want to submit as many batches as we
can, as that shows us the worst-case impact on system latency, it is not
a very realistic case. To introduce a bit more realism, allow the batches
to run for a user-defined duration.
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
benchmarks/gem_syslatency.c | 71 ++++++++++++++++++++++++++++++++++---
1 file changed, 67 insertions(+), 4 deletions(-)
diff --git a/benchmarks/gem_syslatency.c b/benchmarks/gem_syslatency.c
index d1056773a..45cabe86c 100644
--- a/benchmarks/gem_syslatency.c
+++ b/benchmarks/gem_syslatency.c
@@ -51,6 +51,7 @@ static volatile int done;
struct gem_busyspin {
pthread_t thread;
+ unsigned long sz;
unsigned long count;
bool leak;
bool interrupts;
@@ -96,7 +97,8 @@ static void *gem_busyspin(void *arg)
struct gem_busyspin *bs = arg;
struct drm_i915_gem_execbuffer2 execbuf;
struct drm_i915_gem_exec_object2 obj[2];
- const unsigned sz = bs->leak ? 16 << 20 : 4 << 10;
+ const unsigned sz =
+ bs->sz ? bs->sz + sizeof(bbe) : bs->leak ? 16 << 20 : 4 << 10;
unsigned engines[16];
unsigned nengine;
unsigned engine;
@@ -112,7 +114,7 @@ static void *gem_busyspin(void *arg)
obj[0].handle = gem_create(fd, 4096);
obj[0].flags = EXEC_OBJECT_WRITE;
obj[1].handle = gem_create(fd, sz);
- gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+ gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));
Hm, what was the point of creating large batches here if the batch buffer
end (bbe) was always written first, at offset 0?
memset(&execbuf, 0, sizeof(execbuf));
execbuf.buffers_ptr = (uintptr_t)(obj + !bs->interrupts);
@@ -125,6 +127,12 @@ static void *gem_busyspin(void *arg)
}
while (!done) {
+ for (int n = 0; n < nengine; n++) {
+ const int m = rand() % nengine;
+ unsigned int tmp = engines[n];
+ engines[n] = engines[m];
+ engines[m] = tmp;
igt_exchange_int? The problem with frameworks getting more featureful is
that it becomes easier to forget what is there. :) Or even igt_permute_array?
But what does this have to do with batch duration?
+ }
for (int n = 0; n < nengine; n++) {
execbuf.flags &= ~ENGINE_FLAGS;
execbuf.flags |= engines[n];
@@ -134,7 +142,7 @@ static void *gem_busyspin(void *arg)
if (bs->leak) {
gem_madvise(fd, obj[1].handle, I915_MADV_DONTNEED);
obj[1].handle = gem_create(fd, sz);
- gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+ gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));
}
}
@@ -294,6 +302,50 @@ static void *background_fs(void *path)
return NULL;
}
+static unsigned long calibrate_nop(unsigned int target_us,
+ unsigned int tolerance_pct)
+{
+ const uint32_t bbe = MI_BATCH_BUFFER_END;
+ const unsigned int loops = 100;
+ struct drm_i915_gem_exec_object2 obj = {};
+ struct drm_i915_gem_execbuffer2 eb =
+ { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+ struct timespec t_0, t_end;
+ long sz, prev;
+ int fd;
+
+ fd = drm_open_driver(DRIVER_INTEL);
+
+ clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+ sz = 256 * 1024;
+ do {
+ struct timespec t_start;
+
+ obj.handle = gem_create(fd, sz + sizeof(bbe));
+ gem_write(fd, obj.handle, sz, &bbe, sizeof(bbe));
+ gem_execbuf(fd, &eb);
+ gem_sync(fd, obj.handle);
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+ for (int loop = 0; loop < loops; loop++)
+ gem_execbuf(fd, &eb);
+ gem_sync(fd, obj.handle);
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ gem_close(fd, obj.handle);
+
+ prev = sz;
+ sz = loops * sz / elapsed(&t_start, &t_end) * 1e3 * target_us;
+ sz = ALIGN(sz, sizeof(uint32_t));
+ } while (elapsed(&t_0, &t_end) < 5 ||
+ abs(sz - prev) > (sz * tolerance_pct / 100));
+
+ close(fd);
+
+ return sz;
+}
I presume this is a copy&paste, so I don't have to look into it in detail.
+
int main(int argc, char **argv)
{
struct gem_busyspin *busy;
@@ -309,9 +361,10 @@ int main(int argc, char **argv)
int enable_gem_sysbusy = 1;
bool leak = false;
bool interrupts = false;
+ long batch = 0;
int n, c;
- while ((c = getopt(argc, argv, "t:f:bmni1")) != -1) {
+ while ((c = getopt(argc, argv, "r:t:f:bmni1")) != -1) {
switch (c) {
case '1':
ncpus = 1;
@@ -328,6 +381,10 @@ int main(int argc, char **argv)
if (time < 0)
time = INT_MAX;
break;
+ case 'r':
+ /* Duration of each batch (microseconds) */
+ batch = atoi(optarg);
+ break;
case 'f':
/* Select an output field */
field = atoi(optarg);
@@ -350,11 +407,17 @@ int main(int argc, char **argv)
force_low_latency();
min = min_measurement_error();
+ if (batch > 0)
+ batch = calibrate_nop(batch, 2);
+ else
+ batch = -batch;
+
I have no idea what the purpose of this is. Does the user pass in a negative
value on the command line? But then the calibration is missing.
busy = calloc(ncpus, sizeof(*busy));
pthread_attr_init(&attr);
if (enable_gem_sysbusy) {
for (n = 0; n < ncpus; n++) {
bind_cpu(&attr, n);
+ busy[n].sz = batch;
busy[n].leak = leak;
busy[n].interrupts = interrupts;
pthread_create(&busy[n].thread, &attr,
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx