On SandyBridge, the BLT commands were split off from the RENDER commands onto a ring of their own, in addition to the BSD split inherited from Ironlake. So we need to make sure that we exercise each ring and, in order to do so, that each batch takes longer to execute than it takes for us to submit it. Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> --- lib/intel_gpu_tools.h | 5 ++ lib/rendercopy.h | 5 ++ tests/gem_ringfill.c | 174 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 131 insertions(+), 53 deletions(-) diff --git a/lib/intel_gpu_tools.h b/lib/intel_gpu_tools.h index f46abfa..eb21a16 100644 --- a/lib/intel_gpu_tools.h +++ b/lib/intel_gpu_tools.h @@ -25,6 +25,9 @@ * */ +#ifndef INTEL_GPU_TOOLS_H +#define INTEL_GPU_TOOLS_H + #include <stdint.h> #include <sys/types.h> #include <pciaccess.h> @@ -94,3 +97,5 @@ extern enum pch_type pch; void intel_check_pch(void); #define HAS_CPT (pch == PCH_CPT) + +#endif /* INTEL_GPU_TOOLS_H */ diff --git a/lib/rendercopy.h b/lib/rendercopy.h index 7547ac4..4fcc817 100644 --- a/lib/rendercopy.h +++ b/lib/rendercopy.h @@ -58,6 +58,11 @@ static inline unsigned buf_height(struct scratch_buf *buf) return buf->size/buf->stride; } +typedef void (*render_copyfunc_t)(struct intel_batchbuffer *batch, + struct scratch_buf *src, unsigned src_x, unsigned src_y, + unsigned width, unsigned height, + struct scratch_buf *dst, unsigned dst_x, unsigned dst_y); + void gen6_render_copyfunc(struct intel_batchbuffer *batch, struct scratch_buf *src, unsigned src_x, unsigned src_y, unsigned width, unsigned height, diff --git a/tests/gem_ringfill.c b/tests/gem_ringfill.c index 685a010..922ef87 100644 --- a/tests/gem_ringfill.c +++ b/tests/gem_ringfill.c @@ -41,26 +41,135 @@ #include <errno.h> #include <sys/stat.h> #include <sys/time.h> + #include "drm.h" #include "i915_drm.h" #include "drmtest.h" #include "intel_bufmgr.h" #include "intel_batchbuffer.h" #include "intel_gpu_tools.h" +#include "rendercopy.h" + +struct bo { + const char 
*ring; + drm_intel_bo *src, *dst, *tmp; +}; static drm_intel_bufmgr *bufmgr; -struct intel_batchbuffer *batch; +static struct intel_batchbuffer *batch; static const int width = 512, height = 512; +static void blt_copy(struct bo *b, int x, int y) +{ + /* Dummy load to fill the ring */ + intel_copy_bo(batch, b->tmp, b->src, width, height); + + /* And copy the src into dst, pixel by pixel, for verification */ + BEGIN_BATCH(8); + OUT_BATCH(XY_SRC_COPY_BLT_CMD | + XY_SRC_COPY_BLT_WRITE_ALPHA | + XY_SRC_COPY_BLT_WRITE_RGB); + OUT_BATCH((3 << 24) | /* 32 bits */ + (0xcc << 16) | /* copy ROP */ + 4 * width); + OUT_BATCH((y << 16) | x); /* dst x1,y1 */ + OUT_BATCH(((y + 1) << 16) | (x + 1)); /* dst x2,y2 */ + OUT_RELOC(b->dst, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); + OUT_BATCH((y << 16) | x); /* src x1,y1 */ + OUT_BATCH(4 * width); + OUT_RELOC(b->src, I915_GEM_DOMAIN_RENDER, 0, 0); + ADVANCE_BATCH(); + + intel_batchbuffer_flush(batch); +} + +static void render_copy(struct bo *b, int x, int y) +{ + struct scratch_buf src, tmp, dst; + render_copyfunc_t copy; + + /* Strictly only required on architectures with a separate BLT ring, + * but lets stress everybody. 
+ */ + copy = NULL; + if (IS_GEN2(batch->devid)) + copy = gen2_render_copyfunc; + else if (IS_GEN3(batch->devid)) + copy = gen3_render_copyfunc; + else if (IS_GEN6(batch->devid)) + copy = gen6_render_copyfunc; + if (copy == NULL) + return; + + src.stride = 4 * width; + src.tiling = 0; + src.data = src.cpu_mapping = NULL; + src.size = 4 * width * height; + src.num_tiles = 4 * width * height; + dst = tmp = src; + + src.bo = b->src; + tmp.bo = b->tmp; + dst.bo = b->dst; + + /* Dummy load to fill the ring */ + copy(batch, &src, 0, 0, width, height, &tmp, 0, 0); + /* And copy the src into dst, pixel by pixel, for verification */ + copy(batch, &src, x, y, 1, 1, &dst, x, y); +} + +static void create_bo(struct bo *b, const char *ring) +{ + int size = 4 * width * height, i; + uint32_t *map; + + b->ring = ring; + b->src = drm_intel_bo_alloc(bufmgr, "src", size, 4096); + b->dst = drm_intel_bo_alloc(bufmgr, "dst", size, 4096); + b->tmp = drm_intel_bo_alloc(bufmgr, "tmp", size, 4096); + + /* Fill the src with indexes of the pixels */ + drm_intel_bo_map(b->src, true); + map = b->src->virtual; + for (i = 0; i < width * height; i++) + map[i] = i; + drm_intel_bo_unmap(b->src); + + /* Fill the dst with garbage. 
*/ + drm_intel_bo_map(b->dst, true); + map = b->dst->virtual; + for (i = 0; i < width * height; i++) + map[i] = 0xd0d0d0d0; + drm_intel_bo_unmap(b->dst); +} + +static int check_bo(struct bo *b) +{ + const uint32_t *map; + int i, fails = 0; + + drm_intel_bo_map(b->dst, false); + map = b->dst->virtual; + for (i = 0; i < width*height; i++) { + if (map[i] != i && ++fails <= 9) { + int x = i % width; + int y = i / width; + + printf("%s: copy #%d at %d,%d failed: read 0x%08x\n", + b->ring, i, x, y, map[i]); + } + } + drm_intel_bo_unmap(b->dst); + + return fails; +} + int main(int argc, char **argv) { + struct bo render, blt; int fd; int i; - drm_intel_bo *src_bo, *dst_bo; - uint32_t *map; int fails = 0; - int pitch = width * 4; - int size = pitch * height; int blits; fd = drm_open_any(); @@ -69,22 +178,8 @@ int main(int argc, char **argv) drm_intel_bufmgr_gem_enable_reuse(bufmgr); batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd)); - src_bo = drm_intel_bo_alloc(bufmgr, "src bo", size, 4096); - dst_bo = drm_intel_bo_alloc(bufmgr, "src bo", size, 4096); - - /* Fill the src with indexes of the pixels */ - drm_intel_bo_map(src_bo, true); - map = src_bo->virtual; - for (i = 0; i < width * height; i++) - map[i] = i; - drm_intel_bo_unmap(src_bo); - - /* Fill the dst with garbage. 
*/ - drm_intel_bo_map(dst_bo, true); - map = dst_bo->virtual; - for (i = 0; i < width * height; i++) - map[i] = 0xd0d0d0d0; - drm_intel_bo_unmap(dst_bo); + create_bo(&render, "render"); + create_bo(&blt, "blt"); /* The ring we've been using is 128k, and each rendering op * will use at least 8 dwords: @@ -108,41 +203,14 @@ int main(int argc, char **argv) assert(y < height); - BEGIN_BATCH(8); - OUT_BATCH(XY_SRC_COPY_BLT_CMD | - XY_SRC_COPY_BLT_WRITE_ALPHA | - XY_SRC_COPY_BLT_WRITE_RGB); - OUT_BATCH((3 << 24) | /* 32 bits */ - (0xcc << 16) | /* copy ROP */ - pitch); - OUT_BATCH((y << 16) | x); /* dst x1,y1 */ - OUT_BATCH(((y + 1) << 16) | (x + 1)); /* dst x2,y2 */ - OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); - OUT_BATCH((y << 16) | x); /* src x1,y1 */ - OUT_BATCH(pitch); - OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); - ADVANCE_BATCH(); - - intel_batchbuffer_flush(batch); + blt_copy(&blt, x, y); + render_copy(&render, x, y); } /* verify */ - drm_intel_bo_map(dst_bo, false); - map = dst_bo->virtual; - for (i = 0; i < blits; i++) { - int x = i % width; - int y = i / width; - - if (map[i] != i) { - - printf("Copy #%d at %d,%d failed: read 0x%08x\n", - i, x, y, map[i]); - - if (fails++ > 9) - exit(1); - } - } - drm_intel_bo_unmap(dst_bo); + fails = 0; + fails += check_bo(&blt); + fails += check_bo(&render); intel_batchbuffer_free(batch); drm_intel_bufmgr_destroy(bufmgr); -- 1.7.9