On Wed, Mar 07, 2012 at 11:21:07AM -0800, Eric Anholt wrote: > From: Kenneth Graunke <kenneth at whitecape.org> > > This will allow the driver to capture all of its execution state to a > file for later debugging. intel_gpu_dump is limited in that it only > captures batchbuffers, and Mesa's captures, while more complete, still > capture only a portion of the state involved in execution. > > It also enables us to load traces in our internal simulator. > > Signed-off-by: Eric Anholt <eric at anholt.net> > Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com> > Signed-off-by: Kenneth Graunke <kenneth at whitecape.org> > --- > intel/Makefile.am | 1 + > intel/intel_aub.h | 123 ++++++++++++++++++ > intel/intel_bufmgr.h | 14 ++ > intel/intel_bufmgr_gem.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 453 insertions(+), 0 deletions(-) > create mode 100644 intel/intel_aub.h > > diff --git a/intel/Makefile.am b/intel/Makefile.am > index 06362b6..dc01a96 100644 > --- a/intel/Makefile.am > +++ b/intel/Makefile.am > @@ -53,6 +53,7 @@ intel_bufmgr_gem_o_CFLAGS = $(AM_CFLAGS) -c99 > > libdrm_intelincludedir = ${includedir}/libdrm > libdrm_intelinclude_HEADERS = intel_bufmgr.h \ > + intel_aub.h \ > intel_debug.h > > # This may be interesting even outside of "make check", due to the -dump option. > diff --git a/intel/intel_aub.h b/intel/intel_aub.h > new file mode 100644 > index 0000000..a36fd53 > --- /dev/null > +++ b/intel/intel_aub.h > @@ -0,0 +1,123 @@ > +/* > + * Copyright ©
2010 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > + * IN THE SOFTWARE. > + * > + * Authors: > + * Eric Anholt <eric at anholt.net> > + * > + */ > + > +/** @file intel_aub.h > + * > + * The AUB file is a file format used by Intel's internal simulation > + * and other validation tools. It can be used at various levels by a > + * driver to input state to the simulated hardware or a replaying > + * debugger. > + * > + * We choose to dump AUB files using the trace block format for ease > + * of implementation -- dump out the blocks of memory as plain blobs > + * and insert ring commands to execute the batchbuffer blob. > + */ > + > +#ifndef _INTEL_AUB_H > +#define _INTEL_AUB_H > + > +#define AUB_MI_NOOP (0) > +#define AUB_MI_BATCH_BUFFER_START (0x31 << 23) > +#define AUB_PIPE_CONTROL (0x7a000002) > + > +/* DW0: instruction type. 
*/ > + > +#define CMD_AUB (7 << 29) > + > +#define CMD_AUB_HEADER (CMD_AUB | (1 << 23) | (0x05 << 16)) > +/* DW1 */ > +# define AUB_HEADER_MAJOR_SHIFT 24 > +# define AUB_HEADER_MINOR_SHIFT 16 > + > +#define CMD_AUB_TRACE_HEADER_BLOCK (CMD_AUB | (1 << 23) | (0x41 << 16)) > +#define CMD_AUB_DUMP_BMP (CMD_AUB | (1 << 23) | (0x9e << 16)) > + > +/* DW1 */ > +#define AUB_TRACE_OPERATION_MASK 0x000000ff > +#define AUB_TRACE_OP_COMMENT 0x00000000 > +#define AUB_TRACE_OP_DATA_WRITE 0x00000001 > +#define AUB_TRACE_OP_COMMAND_WRITE 0x00000002 > +#define AUB_TRACE_OP_MMIO_WRITE 0x00000003 > +// operation = TRACE_DATA_WRITE, Type > +#define AUB_TRACE_TYPE_MASK 0x0000ff00 > +#define AUB_TRACE_TYPE_NOTYPE (0 << 8) > +#define AUB_TRACE_TYPE_BATCH (1 << 8) > +#define AUB_TRACE_TYPE_VERTEX_BUFFER (5 << 8) > +#define AUB_TRACE_TYPE_2D_MAP (6 << 8) > +#define AUB_TRACE_TYPE_CUBE_MAP (7 << 8) > +#define AUB_TRACE_TYPE_VOLUME_MAP (9 << 8) > +#define AUB_TRACE_TYPE_1D_MAP (10 << 8) > +#define AUB_TRACE_TYPE_CONSTANT_BUFFER (11 << 8) > +#define AUB_TRACE_TYPE_CONSTANT_URB (12 << 8) > +#define AUB_TRACE_TYPE_INDEX_BUFFER (13 << 8) > +#define AUB_TRACE_TYPE_GENERAL (14 << 8) > +#define AUB_TRACE_TYPE_SURFACE (15 << 8) > + > + > +// operation = TRACE_COMMAND_WRITE, Type = > +#define AUB_TRACE_TYPE_RING_HWB (1 << 8) > +#define AUB_TRACE_TYPE_RING_PRB0 (2 << 8) > +#define AUB_TRACE_TYPE_RING_PRB1 (3 << 8) > +#define AUB_TRACE_TYPE_RING_PRB2 (4 << 8) > + > +// Address space > +#define AUB_TRACE_ADDRESS_SPACE_MASK 0x00ff0000 > +#define AUB_TRACE_MEMTYPE_GTT (0 << 16) > +#define AUB_TRACE_MEMTYPE_LOCAL (1 << 16) > +#define AUB_TRACE_MEMTYPE_NONLOCAL (2 << 16) > +#define AUB_TRACE_MEMTYPE_PCI (3 << 16) > +#define AUB_TRACE_MEMTYPE_GTT_ENTRY (4 << 16) > + > +/* DW2 */ > +// operation = TRACE_DATA_WRITE, Type = TRACE_DATA_WRITE_GENERAL_STATE > +#define AUB_TRACE_GENERAL_STATE_MASK 0x000000ff > + > +#define AUB_TRACE_VS_STATE 0x00000001 > +#define AUB_TRACE_GS_STATE 0x00000002 > +#define 
AUB_TRACE_CL_STATE 0x00000003 > +#define AUB_TRACE_SF_STATE 0x00000004 > +#define AUB_TRACE_WM_STATE 0x00000005 > +#define AUB_TRACE_CC_STATE 0x00000006 > +#define AUB_TRACE_CL_VP 0x00000007 > +#define AUB_TRACE_SF_VP 0x00000008 > +#define AUB_TRACE_CC_VP 0x00000009 > +#define AUB_TRACE_SAMPLER_STATE 0x0000000a > +#define AUB_TRACE_KERNEL 0x0000000b > +#define AUB_TRACE_SCRATCH 0x0000000c > +#define AUB_TRACE_SDC 0x0000000d > +#define AUB_TRACE_BLEND_STATE 0x00000016 > +#define AUB_TRACE_DEPTH_STENCIL_STATE 0x00000017 > + > +// operation = TRACE_DATA_WRITE, Type = TRACE_DATA_WRITE_SURFACE_STATE > +#define AUB_TRACE_SURFACE_STATE_MASK 0x00000ff00 > +#define AUB_TRACE_BINDING_TABLE 0x000000100 > +#define AUB_TRACE_SURFACE_STATE 0x000000200 > + > +/* DW3: address */ > +/* DW4: len */ > + > +#endif /* _INTEL_AUB_H */ > diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h > index 8036031..fa6f2b8 100644 > --- a/intel/intel_bufmgr.h > +++ b/intel/intel_bufmgr.h > @@ -36,6 +36,7 @@ > > #include <stdio.h> > #include <stdint.h> > +#include <stdio.h> > > struct drm_clip_rect; > > @@ -84,6 +85,13 @@ struct _drm_intel_bo { > int handle; > }; > > +enum aub_dump_bmp_format { > + AUB_DUMP_BMP_FORMAT_8BIT = 1, > + AUB_DUMP_BMP_FORMAT_ARGB_4444 = 4, > + AUB_DUMP_BMP_FORMAT_ARGB_0888 = 6, > + AUB_DUMP_BMP_FORMAT_ARGB_8888 = 7, > +}; > + > #define BO_ALLOC_FOR_RENDER (1<<0) > > drm_intel_bo *drm_intel_bo_alloc(drm_intel_bufmgr *bufmgr, const char *name, > @@ -154,6 +162,12 @@ int drm_intel_gem_bo_get_reloc_count(drm_intel_bo *bo); > void drm_intel_gem_bo_clear_relocs(drm_intel_bo *bo, int start); > void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable); > > +void drm_intel_bufmgr_gem_set_aub_dump(drm_intel_bufmgr *bufmgr, int enable); > +void drm_intel_gem_bo_aub_dump_bmp(drm_intel_bo *bo, > + int x1, int y1, int width, int height, > + enum aub_dump_bmp_format format, > + int pitch, int offset); > + > int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr 
*bufmgr, int crtc_id); > > int drm_intel_get_aperture_sizes(int fd, size_t *mappable, size_t *total); > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c > index ba38e50..45585f7 100644 > --- a/intel/intel_bufmgr_gem.c > +++ b/intel/intel_bufmgr_gem.c > @@ -58,6 +58,7 @@ > #include "intel_bufmgr.h" > #include "intel_bufmgr_priv.h" > #include "intel_chipset.h" > +#include "intel_aub.h" > #include "string.h" > > #include "i915_drm.h" > @@ -121,6 +122,9 @@ typedef struct _drm_intel_bufmgr_gem { > unsigned int bo_reuse : 1; > unsigned int no_exec : 1; > bool fenced_relocs; > + > + FILE *aub_file; > + uint32_t aub_offset; > } drm_intel_bufmgr_gem; > > #define DRM_INTEL_RELOC_FENCE (1<<0) > @@ -215,6 +219,8 @@ struct _drm_intel_bo_gem { > > /** Flags that we may need to do the SW_FINSIH ioctl on unmap. */ > bool mapped_cpu_write; > + > + uint32_t aub_offset; > }; > > static unsigned int > @@ -1715,6 +1721,247 @@ drm_intel_update_buffer_offsets2 (drm_intel_bufmgr_gem *bufmgr_gem) > } > } > > +static void > +aub_out(drm_intel_bufmgr_gem *bufmgr_gem, uint32_t data) > +{ > + fwrite(&data, 1, 4, bufmgr_gem->aub_file); > +} > + > +static void > +aub_out_data(drm_intel_bufmgr_gem *bufmgr_gem, void *data, size_t size) > +{ > + fwrite(data, 1, size, bufmgr_gem->aub_file); > +} > + > +static void > +aub_write_bo_data(drm_intel_bo *bo, uint32_t offset, uint32_t size) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; > + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; > + uint32_t *data; > + unsigned int i; > + > + data = malloc(bo->size); > + drm_intel_bo_get_subdata(bo, offset, size, data); > + > + /* Easy mode: write out bo with no relocations */ > + if (!bo_gem->reloc_count) { > + aub_out_data(bufmgr_gem, data, size); > + free(data); > + return; > + } > + > + /* Otherwise, handle the relocations while writing. 
*/ > + for (i = 0; i < size / 4; i++) { > + int r; > + for (r = 0; r < bo_gem->reloc_count; r++) { > + struct drm_i915_gem_relocation_entry *reloc; > + drm_intel_reloc_target *info; > + > + reloc = &bo_gem->relocs[r]; > + info = &bo_gem->reloc_target_info[r]; > + > + if (reloc->offset == offset + i * 4) { > + drm_intel_bo_gem *target_gem; > + uint32_t val; > + > + target_gem = (drm_intel_bo_gem *)info->bo; > + > + val = reloc->delta; > + val += target_gem->aub_offset; > + > + aub_out(bufmgr_gem, val); > + data[i] = val; > + break; > + } > + } > + if (r == bo_gem->reloc_count) { > + /* no relocation, just the data */ > + aub_out(bufmgr_gem, data[i]); > + } > + } > + > + free(data); > +} > + > +static void > +aub_bo_get_address(drm_intel_bo *bo) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; > + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; > + > + /* Give the object a graphics address in the AUB file. We > + * don't just use the GEM object address because we do AUB > + * dumping before execution -- we want to successfully log > + * when the hardware might hang, and we might even want to aub > + * capture for a driver trying to execute on a different > + * generation of hardware by disabling the actual kernel exec > + * call. > + */ > + bo_gem->aub_offset = bufmgr_gem->aub_offset; > + bufmgr_gem->aub_offset += bo->size; > + /* XXX: Handle aperture overflow. 
*/ > + assert(bufmgr_gem->aub_offset < 256 * 1024 * 1024); > +} > + > +static void > +aub_write_trace_block(drm_intel_bo *bo, uint32_t type, uint32_t subtype, > + uint32_t offset, uint32_t size) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; > + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; > + > + aub_out(bufmgr_gem, > + CMD_AUB_TRACE_HEADER_BLOCK | > + (5 - 2)); > + aub_out(bufmgr_gem, > + AUB_TRACE_MEMTYPE_GTT | type | AUB_TRACE_OP_DATA_WRITE); > + aub_out(bufmgr_gem, subtype); > + aub_out(bufmgr_gem, bo_gem->aub_offset + offset); > + aub_out(bufmgr_gem, size); > + aub_write_bo_data(bo, offset, size); > +} > + > +static void > +aub_write_bo(drm_intel_bo *bo) > +{ > + uint32_t block_size; > + uint32_t offset; > + > + aub_bo_get_address(bo); > + > + /* Break up large objects into multiple writes. Otherwise a > + * 128kb VBO would overflow the 16 bits of size field in the > + * packet header and everything goes badly after that. > + */ > + for (offset = 0; offset < bo->size; offset += block_size) { > + block_size = bo->size - offset; > + > + if (block_size > 8 * 4096) > + block_size = 8 * 4096; > + > + aub_write_trace_block(bo, AUB_TRACE_TYPE_NOTYPE, 0, > + offset, block_size); > + } > +} > + > +/* > + * Make a ringbuffer on fly and dump it > + */ > +static void > +aub_build_dump_ringbuffer(drm_intel_bufmgr_gem *bufmgr_gem, > + uint32_t batch_buffer, int ring_flag) > +{ > + uint32_t ringbuffer[4096]; > + int ring = AUB_TRACE_TYPE_RING_PRB0; /* The default ring */ > + int ring_count = 0; > + > + if (ring_flag == I915_EXEC_BSD) > + ring = AUB_TRACE_TYPE_RING_PRB1; > + > + /* Make a ring buffer to execute our batchbuffer. */ > + memset(ringbuffer, 0, sizeof(ringbuffer)); > + ringbuffer[ring_count++] = AUB_MI_BATCH_BUFFER_START; > + ringbuffer[ring_count++] = batch_buffer; > + > + /* Write out the ring. This appears to trigger execution of > + * the ring in the simulator. 
> + */ > + aub_out(bufmgr_gem, > + CMD_AUB_TRACE_HEADER_BLOCK | > + (5 - 2)); > + aub_out(bufmgr_gem, > + AUB_TRACE_MEMTYPE_GTT | ring | AUB_TRACE_OP_COMMAND_WRITE); > + aub_out(bufmgr_gem, 0); /* general/surface subtype */ > + aub_out(bufmgr_gem, bufmgr_gem->aub_offset); > + aub_out(bufmgr_gem, ring_count * 4); > + > + /* FIXME: Need some flush operations here? */ > + aub_out_data(bufmgr_gem, ringbuffer, ring_count * 4); > + > + /* Update offset pointer */ > + bufmgr_gem->aub_offset += 4096; > +} > + > +void > +drm_intel_gem_bo_aub_dump_bmp(drm_intel_bo *bo, > + int x1, int y1, int width, int height, > + enum aub_dump_bmp_format format, > + int pitch, int offset) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; > + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *)bo; > + uint32_t cpp; > + > + switch (format) { > + case AUB_DUMP_BMP_FORMAT_8BIT: > + cpp = 1; > + break; > + case AUB_DUMP_BMP_FORMAT_ARGB_4444: > + cpp = 2; > + break; > + case AUB_DUMP_BMP_FORMAT_ARGB_0888: > + case AUB_DUMP_BMP_FORMAT_ARGB_8888: > + cpp = 4; > + break; > + default: > + printf("Unknown AUB dump format %d\n", format); > + return; > + } > + > + if (!bufmgr_gem->aub_file) > + return; > + > + aub_out(bufmgr_gem, CMD_AUB_DUMP_BMP | 4); > + aub_out(bufmgr_gem, (y1 << 16) | x1); > + aub_out(bufmgr_gem, > + (format << 24) | > + (cpp << 19) | > + pitch / 4); > + aub_out(bufmgr_gem, (height << 16) | width); > + aub_out(bufmgr_gem, bo_gem->aub_offset + offset); > + aub_out(bufmgr_gem, > + ((bo_gem->tiling_mode != I915_TILING_NONE) ? (1 << 2) : 0) | > + ((bo_gem->tiling_mode == I915_TILING_Y) ? 
(1 << 3) : 0)); > +} > + > +static void > +aub_exec(drm_intel_bo *bo, int ring_flag, int used) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr; > + drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo; > + int i; > + > + if (!bufmgr_gem->aub_file) > + return; > + > + /* Write out all but the batchbuffer to AUB memory */ > + for (i = 0; i < bufmgr_gem->exec_count - 1; i++) { > + if (bufmgr_gem->exec_bos[i] != bo) > + aub_write_bo(bufmgr_gem->exec_bos[i]); > + } > + > + aub_bo_get_address(bo); > + > + /* Dump the batchbuffer. */ > + aub_write_trace_block(bo, AUB_TRACE_TYPE_BATCH, 0, > + 0, used); > + aub_write_trace_block(bo, AUB_TRACE_TYPE_NOTYPE, 0, > + used, bo->size - used); > + > + /* Dump ring buffer */ > + aub_build_dump_ringbuffer(bufmgr_gem, bo_gem->aub_offset, ring_flag); > + > + fflush(bufmgr_gem->aub_file); > + > + /* > + * One frame has been dumped. So reset the aub_offset for the next frame. > + * > + * FIXME: Can we do this? > + */ > + bufmgr_gem->aub_offset = 0x10000; > +} > + > static int > drm_intel_gem_bo_exec(drm_intel_bo *bo, int used, > drm_clip_rect_t * cliprects, int num_cliprects, int DR4) > @@ -1830,6 +2077,8 @@ drm_intel_gem_bo_mrb_exec2(drm_intel_bo *bo, int used, > execbuf.rsvd1 = 0; > execbuf.rsvd2 = 0; > > + aub_exec(bo, flags, used); > + > if (bufmgr_gem->no_exec) > goto skip_execution; > > @@ -2360,6 +2609,72 @@ drm_intel_bufmgr_gem_get_devid(drm_intel_bufmgr *bufmgr) > } > > /** > + * Sets up AUB dumping. > + * > + * This is a trace file format that can be used with the simulator. > + * Packets are emitted in a format somewhat like GPU command packets. > + * You can set up a GTT and upload your objects into the referenced > + * space, then send off batchbuffers and get BMPs out the other end. 
> + */ > +void > +drm_intel_bufmgr_gem_set_aub_dump(drm_intel_bufmgr *bufmgr, int enable) > +{ > + drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *)bufmgr; > + int entry = 0x200003; > + int i; > + int gtt_size = 0x10000; > + > + if (!enable) { > + if (bufmgr_gem->aub_file) { > + fclose(bufmgr_gem->aub_file); > + bufmgr_gem->aub_file = NULL; > + } > + } > + > + if (geteuid() != getuid()) > + return; > + > + bufmgr_gem->aub_file = fopen("intel.aub", "w+"); I think it would be better to name the AUB dump file after the program being traced. For example, if we run: $ INTEL_DEBUG=aub glxgears it would be good to get glxgears.aub rather than intel.aub. Otherwise, each run overwrites the file dumped by the previous one, which is a little inconvenient. > + if (!bufmgr_gem->aub_file) > + return; > + > + /* Start allocating objects from just after the GTT. */ > + bufmgr_gem->aub_offset = gtt_size; > + > + /* Start with a (required) version packet. */ > + aub_out(bufmgr_gem, CMD_AUB_HEADER | (13 - 2)); > + aub_out(bufmgr_gem, > + (4 << AUB_HEADER_MAJOR_SHIFT) | > + (0 << AUB_HEADER_MINOR_SHIFT)); > + for (i = 0; i < 8; i++) { > + aub_out(bufmgr_gem, 0); /* app name */ > + } > + aub_out(bufmgr_gem, 0); /* timestamp */ > + aub_out(bufmgr_gem, 0); /* timestamp */ > + aub_out(bufmgr_gem, 0); /* comment len */ > + > + /* Set up the GTT.
The max we can handle is 256M */ > + aub_out(bufmgr_gem, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2)); > + aub_out(bufmgr_gem, AUB_TRACE_MEMTYPE_NONLOCAL | 0 | AUB_TRACE_OP_DATA_WRITE); > + aub_out(bufmgr_gem, 0); /* subtype */ > + aub_out(bufmgr_gem, 0); /* offset */ > + aub_out(bufmgr_gem, gtt_size); /* size */ > + for (i = 0x000; i < gtt_size; i += 4, entry += 0x1000) { > + aub_out(bufmgr_gem, entry); > + } > + > + /* MI_FLUSH enable */ > + if (bufmgr_gem->gen >= 6) { > + aub_out(bufmgr_gem, CMD_AUB_TRACE_HEADER_BLOCK | (5 - 2)); > + aub_out(bufmgr_gem, AUB_TRACE_OP_MMIO_WRITE); > + aub_out(bufmgr_gem, 0); > + aub_out(bufmgr_gem, 0x209c); /* reg addr */ > + aub_out(bufmgr_gem, 0x04); /* Length in byte */ > + aub_out(bufmgr_gem, ((1 << 12) << 16) | (1 << 12)); > + } Zhenyu and I agreed that doing the MI_FLUSH enable setup is the driver's responsibility. MI_FLUSH is deprecated, so if you still use it, it is your job to set the MI_FLUSH enable bit. > +} > + > +/** > + * Initializes the GEM buffer manager, which uses the kernel to allocate, map, > + * and manage map buffer objections. > * > -- > 1.7.9.1