On SandyBridge, the BLT commands were split from the RENDER commands as well as the BSD split inherited from Ironlake. So we need to make sure we do exercise each ring, and in order to do so we also need to make sure each batch takes longer to execute than it takes for us to submit it. v2: Exercise each ring sequentially. Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> --- lib/intel_gpu_tools.h | 5 ++ lib/rendercopy.h | 5 ++ tests/gem_ringfill.c | 185 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 140 insertions(+), 55 deletions(-) diff --git a/lib/intel_gpu_tools.h b/lib/intel_gpu_tools.h index f46abfa..eb21a16 100644 --- a/lib/intel_gpu_tools.h +++ b/lib/intel_gpu_tools.h @@ -25,6 +25,9 @@ * */ +#ifndef INTEL_GPU_TOOLS_H +#define INTEL_GPU_TOOLS_H + #include <stdint.h> #include <sys/types.h> #include <pciaccess.h> @@ -94,3 +97,5 @@ extern enum pch_type pch; void intel_check_pch(void); #define HAS_CPT (pch == PCH_CPT) + +#endif /* INTEL_GPU_TOOLS_H */ diff --git a/lib/rendercopy.h b/lib/rendercopy.h index 7547ac4..4fcc817 100644 --- a/lib/rendercopy.h +++ b/lib/rendercopy.h @@ -58,6 +58,11 @@ static inline unsigned buf_height(struct scratch_buf *buf) return buf->size/buf->stride; } +typedef void (*render_copyfunc_t)(struct intel_batchbuffer *batch, + struct scratch_buf *src, unsigned src_x, unsigned src_y, + unsigned width, unsigned height, + struct scratch_buf *dst, unsigned dst_x, unsigned dst_y); + void gen6_render_copyfunc(struct intel_batchbuffer *batch, struct scratch_buf *src, unsigned src_x, unsigned src_y, unsigned width, unsigned height, diff --git a/tests/gem_ringfill.c b/tests/gem_ringfill.c index 685a010..2d00f06 100644 --- a/tests/gem_ringfill.c +++ b/tests/gem_ringfill.c @@ -41,50 +41,98 @@ #include <errno.h> #include <sys/stat.h> #include <sys/time.h> + #include "drm.h" #include "i915_drm.h" #include "drmtest.h" #include "intel_bufmgr.h" #include "intel_batchbuffer.h" #include "intel_gpu_tools.h" +#include 
"rendercopy.h" + +struct bo { + const char *ring; + drm_intel_bo *src, *dst, *tmp; +}; -static drm_intel_bufmgr *bufmgr; -struct intel_batchbuffer *batch; static const int width = 512, height = 512; -int main(int argc, char **argv) +static void create_bo(drm_intel_bufmgr *bufmgr, + struct bo *b, + const char *ring) { - int fd; - int i; - drm_intel_bo *src_bo, *dst_bo; + int size = 4 * width * height, i; uint32_t *map; - int fails = 0; - int pitch = width * 4; - int size = pitch * height; - int blits; - fd = drm_open_any(); - - bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); - drm_intel_bufmgr_gem_enable_reuse(bufmgr); - batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd)); - - src_bo = drm_intel_bo_alloc(bufmgr, "src bo", size, 4096); - dst_bo = drm_intel_bo_alloc(bufmgr, "src bo", size, 4096); + b->ring = ring; + b->src = drm_intel_bo_alloc(bufmgr, "src", size, 4096); + b->dst = drm_intel_bo_alloc(bufmgr, "dst", size, 4096); + b->tmp = drm_intel_bo_alloc(bufmgr, "tmp", size, 4096); /* Fill the src with indexes of the pixels */ - drm_intel_bo_map(src_bo, true); - map = src_bo->virtual; + drm_intel_bo_map(b->src, true); + map = b->src->virtual; for (i = 0; i < width * height; i++) map[i] = i; - drm_intel_bo_unmap(src_bo); + drm_intel_bo_unmap(b->src); /* Fill the dst with garbage. 
*/ - drm_intel_bo_map(dst_bo, true); - map = dst_bo->virtual; + drm_intel_bo_map(b->dst, true); + map = b->dst->virtual; for (i = 0; i < width * height; i++) map[i] = 0xd0d0d0d0; - drm_intel_bo_unmap(dst_bo); + drm_intel_bo_unmap(b->dst); +} + +static int check_bo(struct bo *b) +{ + const uint32_t *map; + int i, fails = 0; + + drm_intel_bo_map(b->dst, false); + map = b->dst->virtual; + for (i = 0; i < width*height; i++) { + if (map[i] != i && ++fails <= 9) { + int x = i % width; + int y = i / width; + + printf("%s: copy #%d at %d,%d failed: read 0x%08x\n", + b->ring, i, x, y, map[i]); + } + } + drm_intel_bo_unmap(b->dst); + + return fails; +} + +static void destroy_bo(struct bo *b) +{ + drm_intel_bo_unreference(b->src); + drm_intel_bo_unreference(b->tmp); + drm_intel_bo_unreference(b->dst); +} + +static int check_ring(drm_intel_bufmgr *bufmgr, + struct intel_batchbuffer *batch, + const char *ring, + render_copyfunc_t copy) +{ + struct scratch_buf src, tmp, dst; + struct bo bo; + int i; + + create_bo(bufmgr, &bo, ring); + + src.stride = 4 * width; + src.tiling = 0; + src.data = src.cpu_mapping = NULL; + src.size = 4 * width * height; + src.num_tiles = 4 * width * height; + dst = tmp = src; + + src.bo = bo.src; + tmp.bo = bo.tmp; + dst.bo = bo.dst; /* The ring we've been using is 128k, and each rendering op * will use at least 8 dwords: @@ -101,48 +149,75 @@ int main(int argc, char **argv) * So iterate just a little more than that -- if we don't fill the ring * doing this, we aren't likely to with this test. 
*/ - blits = width * height; - for (i = 0; i < blits; i++) { + for (i = 0; i < width * height; i++) { int x = i % width; int y = i / width; assert(y < height); - BEGIN_BATCH(8); - OUT_BATCH(XY_SRC_COPY_BLT_CMD | - XY_SRC_COPY_BLT_WRITE_ALPHA | - XY_SRC_COPY_BLT_WRITE_RGB); - OUT_BATCH((3 << 24) | /* 32 bits */ - (0xcc << 16) | /* copy ROP */ - pitch); - OUT_BATCH((y << 16) | x); /* dst x1,y1 */ - OUT_BATCH(((y + 1) << 16) | (x + 1)); /* dst x2,y2 */ - OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); - OUT_BATCH((y << 16) | x); /* src x1,y1 */ - OUT_BATCH(pitch); - OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0); - ADVANCE_BATCH(); - - intel_batchbuffer_flush(batch); + /* Dummy load to fill the ring */ + copy(batch, &src, 0, 0, width, height, &tmp, 0, 0); + /* And copy the src into dst, pixel by pixel */ + copy(batch, &src, x, y, 1, 1, &dst, x, y); } /* verify */ - drm_intel_bo_map(dst_bo, false); - map = dst_bo->virtual; - for (i = 0; i < blits; i++) { - int x = i % width; - int y = i / width; + i = check_bo(&bo); + destroy_bo(&bo); - if (map[i] != i) { + return i; +} - printf("Copy #%d at %d,%d failed: read 0x%08x\n", - i, x, y, map[i]); +static void blt_copy(struct intel_batchbuffer *batch, + struct scratch_buf *src, unsigned src_x, unsigned src_y, + unsigned width, unsigned height, + struct scratch_buf *dst, unsigned dst_x, unsigned dst_y) +{ + BEGIN_BATCH(8); + OUT_BATCH(XY_SRC_COPY_BLT_CMD | + XY_SRC_COPY_BLT_WRITE_ALPHA | + XY_SRC_COPY_BLT_WRITE_RGB); + OUT_BATCH((3 << 24) | /* 32 bits */ + (0xcc << 16) | /* copy ROP */ + dst->stride); + OUT_BATCH((dst_y << 16) | dst_x); /* dst x1,y1 */ + OUT_BATCH(((dst_y + height) << 16) | (dst_x + width)); /* dst x2,y2 */ + OUT_RELOC(dst->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); + OUT_BATCH((src_y << 16) | src_x); /* src x1,y1 */ + OUT_BATCH(src->stride); + OUT_RELOC(src->bo, I915_GEM_DOMAIN_RENDER, 0, 0); + ADVANCE_BATCH(); + + intel_batchbuffer_flush(batch); +} - if (fails++ > 9) - 
exit(1); - } - } - drm_intel_bo_unmap(dst_bo); +int main(int argc, char **argv) +{ + drm_intel_bufmgr *bufmgr; + struct intel_batchbuffer *batch; + render_copyfunc_t copy; + int fd, fails = 0; + + fd = drm_open_any(); + + bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); + drm_intel_bufmgr_gem_enable_reuse(bufmgr); + batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd)); + + fails += check_ring(bufmgr, batch, "blt", blt_copy); + + /* Strictly only required on architectures with a separate BLT ring, + * but let's stress everybody. + */ + copy = NULL; + if (IS_GEN2(batch->devid)) + copy = gen2_render_copyfunc; + else if (IS_GEN3(batch->devid)) + copy = gen3_render_copyfunc; + else if (IS_GEN6(batch->devid)) + copy = gen6_render_copyfunc; + if (copy) + fails += check_ring(bufmgr, batch, "render", copy); intel_batchbuffer_free(batch); drm_intel_bufmgr_destroy(bufmgr); -- 1.7.9