Signed-off-by: Tom St Denis <tom.stdenis at amd.com> --- doc/sphinx/source/index.rst | 1 + doc/sphinx/source/profiler.rst | 36 ++++++++++++ doc/umr.1 | 4 ++ src/app/CMakeLists.txt | 1 + src/app/main.c | 15 ++++- src/app/print_waves.c | 4 +- src/app/profile.c | 128 +++++++++++++++++++++++++++++++++++++++++ src/app/ring_read.c | 12 +++- src/lib/dump_ib.c | 4 +- src/lib/umr_llvm_disasm.c | 48 +++++++++++++++- src/umr.h | 3 +- src/umrapp.h | 4 +- 12 files changed, 246 insertions(+), 14 deletions(-) create mode 100644 doc/sphinx/source/profiler.rst create mode 100644 src/app/profile.c diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst index fd8b2561e570..fec89140db70 100644 --- a/doc/sphinx/source/index.rst +++ b/doc/sphinx/source/index.rst @@ -15,6 +15,7 @@ UMR: User Mode Register Debugger basic register_access wave_status + profiler vm_decoding ring top diff --git a/doc/sphinx/source/profiler.rst b/doc/sphinx/source/profiler.rst new file mode 100644 index 000000000000..0e44cfd2825d --- /dev/null +++ b/doc/sphinx/source/profiler.rst @@ -0,0 +1,36 @@ +========= +Profiling +========= + +When testing a shader compiler and/or a shader under test, +a profile of where the GPU tends to spend time can be generated with +the umr "--profiler" command: + +:: + + --profiler <nsamples> <usec_delay> + +This captures 'nsamples' samples of wave data with a delay of at +least 'usec_delay' microseconds between them. The output then +contains the list of addresses and opcodes sorted in descending order of hit count. 
+For example, + +:: + + 2865 hits (13 %) 2 at 0x100009c68 0xc4001c0f 0x00000100 exp mrt0 v0, v0, v1, v1 done compr vm + 1199 hits ( 5 %) 2 at 0x1055e9724 0xc40008cf 0x0f090706 exp pos0 v6, v7, v9, v15 done + 1155 hits ( 5 %) 2 at 0x100009c48 0xbf8c0f70 0x16000080 s_waitcnt vmcnt(0) + 710 hits ( 3 %) 2 at 0x10000acf0 0xc4001c0f 0x00000100 exp mrt0 v0, v0, v1, v1 done compr vm + 633 hits ( 3 %) 2 at 0x1023f14c4 0xc400040f 0x00000100 exp mrt0 v0, v0, v1, v1 compr + 633 hits ( 3 %) 2 at 0x100008d64 0xbf8c0f70 0x0a161b12 s_waitcnt vmcnt(0) + 617 hits ( 2 %) 2 at 0x10000a238 0xf0800700 0x00020400 image_sample v[4:6], v0, s[8:15], s[0:3] dmask:0x7 + ...<snip>... + +This indicates that the opcode at VMID 2 offset 0x100009C68 had waves halted +there 2865 times (13% of all captured wave data). The next columns +indicate the raw opcode data and the last column is the LLVM disassembly +of the opcode. + +When testing a known shader this can be used to determine where +the bulk of the processing time is spent. + diff --git a/doc/umr.1 b/doc/umr.1 index f1f5fec55946..a777d9312054 100644 --- a/doc/umr.1 +++ b/doc/umr.1 @@ -118,6 +118,10 @@ from stdin. Disassemble 'size' bytes (in hex) from a given address (in hex). The size can be specified as zero to have umr try and compute the shader size. +.IP "--profiler, -prof <nsamples> <usec_delay>" +Capture 'nsamples' samples of wave data with at least usec_delay microseconds +between captures. + .IP "--update, -u" <filename> Specify update file to add, change, or delete registers from the register database. Useful for adding registers that are not including in the kernel headers. 
diff --git a/src/app/CMakeLists.txt b/src/app/CMakeLists.txt index 4dceebb00e0d..7512a54f68bf 100644 --- a/src/app/CMakeLists.txt +++ b/src/app/CMakeLists.txt @@ -6,6 +6,7 @@ project(umr) add_library(umrapp print.c print_config.c + profile.c ring_read.c scan.c scan_log.c diff --git a/src/app/main.c b/src/app/main.c index 600f3ca02988..d6571e77b74d 100644 --- a/src/app/main.c +++ b/src/app/main.c @@ -495,13 +495,23 @@ int main(int argc, char **argv) shader.addr = address; size = umr_compute_shader_size(asic, &shader); } - umr_vm_disasm(asic, vmid, address, 0, size); + umr_vm_disasm(asic, vmid, address, 0, size, NULL); i += 2; } else { printf("--vm-disasm requires two parameters\n"); return EXIT_FAILURE; } + } else if (!strcmp(argv[i], "-prof") || !strcmp(argv[i], "--profiler")) { + if (i + 2 < argc) { + if (!asic) + asic = get_asic(); + umr_profiler(asic, atoi(argv[i+1]), atoi(argv[i+2])); + i += 2; + } else { + printf("--profiler requires two parameters\n"); + return EXIT_FAILURE; + } } else if (!strcmp(argv[i], "--option") || !strcmp(argv[i], "-O")) { if (i + 1 < argc) { parse_options(argv[i+1]); @@ -581,6 +591,9 @@ int main(int argc, char **argv) "\n\t--vm-disasm, -vdis [<vmid>@]<address> <size>" "\n\t\tDisassemble 'size' bytes (in hex) from a given address (in hex). 
The size can" "\n\t\tbe specified as zero to have umr try and compute the shader size.\n" +"\n\t--profiler, -prof <nsamples> <usec_delay>" + "\n\t\tCapture 'nsamples' samples of wave data with at least usec_delay" + "\n\t\tmicroseconds between captures.\n" "\n\t--option -O <string>[,<string>,...]\n\t\tEnable various flags: bits, bitsfull, empty_log, follow, no_follow_ib, named, many," "\n\t\tuse_pci, use_colour, read_smc, quiet, no_kernel, verbose, halt_waves, disasm_early_term.\n" "\n\n", UMR_BUILD_VER, UMR_BUILD_REV); diff --git a/src/app/print_waves.c b/src/app/print_waves.c index d901bc902ff3..6965f7f31854 100644 --- a/src/app/print_waves.c +++ b/src/app/print_waves.c @@ -100,7 +100,7 @@ void umr_print_waves(struct umr_asic *asic) } pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2; - umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4); + umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL); } else { first = 0; printf("\n------------------------------------------------------\nse%u.sh%u.cu%u.simd%u.wave%u\n", @@ -222,7 +222,7 @@ void umr_print_waves(struct umr_asic *asic) printf("\n\nPGM_MEM:\n"); pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2; - umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4); + umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL); Hv("LDS_ALLOC", wd->ws.lds_alloc.value); PP(lds_alloc, lds_base); diff --git a/src/app/profile.c b/src/app/profile.c new file mode 100644 index 000000000000..3ba3b36efe64 --- /dev/null +++ b/src/app/profile.c @@ -0,0 +1,128 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: Tom St Denis <tom.stdenis at amd.com> + * + */ +#include "umrapp.h" + +struct umr_profiler_hit { + uint32_t + vmid, + inst_dw0, + inst_dw1; + + uint64_t + pc; +}; + +struct umr_profiler_rle { + struct umr_profiler_hit data; + uint32_t cnt; +}; + +static int comp_hits(const void *A, const void *B) +{ + return memcmp(A, B, sizeof(struct umr_profiler_hit)); +} + +static int comp_rle(const void *A, const void *B) +{ + const struct umr_profiler_rle *a = A, *b = B; + return b->cnt - a->cnt; +} + +void umr_profiler(struct umr_asic *asic, int samples, int delay) +{ + struct umr_profiler_hit *ophit, *phit; + struct umr_profiler_rle *prle; + struct umr_wave_data *owd, *wd; + unsigned nitems, nmax, x, y, z; + + nmax = samples; + nitems = 0; + ophit = phit = calloc(nmax, sizeof *phit); + + while (samples--) { + fprintf(stderr, "%5u samples left\r", samples); + fflush(stderr); + do { + umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME); + if (delay) + usleep(delay); + umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_HALT); + wd = umr_scan_wave_data(asic); + } while (!wd); + + // loop through data ... 
+ while (wd) { + phit[nitems].vmid = wd->ws.hw_id.vm_id; + phit[nitems].inst_dw0 = wd->ws.wave_inst_dw0; + phit[nitems].inst_dw1 = wd->ws.wave_inst_dw1; + phit[nitems++].pc = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo; + + if (nitems == nmax) { + nmax += 1000; + ophit = realloc(phit, nmax * sizeof(*phit)); + phit = ophit; + } + + owd = wd->next; + free(wd); + wd = owd; + } + } + umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME); + + qsort(phit, nitems, sizeof(*phit), comp_hits); + prle = calloc(nitems, sizeof *prle); + for (z = y = 0, x = 1; x < nitems; x++) { + if (memcmp(&phit[x], &phit[y], sizeof(*phit))) { + prle[z].data = phit[y]; + prle[z++].cnt = x - y; + y = x; + } + } + + qsort(prle, z, sizeof(*prle), comp_rle); + for (x = 0; x < z; x++) { + char *str[2]; + unsigned char buf[8]; + + memset(str, 0, sizeof(str)); + memcpy(buf, &prle[x].data.inst_dw0, 4); + memcpy(buf + 4, &prle[x].data.inst_dw1, 4); + umr_llvm_disasm(asic, buf, 8, 0, &str[0]); + + printf("%5u hits (%2u %%)\t%u at 0x%llx\t 0x%08lx 0x%08lx\t%s\n", + prle[x].cnt, + (prle[x].cnt * 100) / nitems, + (unsigned)prle[x].data.vmid, + (unsigned long long)prle[x].data.pc, + (unsigned long)prle[x].data.inst_dw0, + (unsigned long)prle[x].data.inst_dw1, str[0]); + free(str[0]); + free(str[1]); + } + + free(prle); + free(phit); +} diff --git a/src/app/ring_read.c b/src/app/ring_read.c index 3ccec1be6d90..112e9f0414ad 100644 --- a/src/app/ring_read.c +++ b/src/app/ring_read.c @@ -32,6 +32,7 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath) uint32_t wptr, rptr, drv_wptr, ringsize, start, end, value, *ring_data; struct umr_ring_decoder decoder, *pdecoder, *ppdecoder; + struct umr_wave_data *wd; memset(ringname, 0, sizeof ringname); memset(from, 0, sizeof from); @@ -146,18 +147,25 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath) free(ring_data); printf("\n"); - umr_dump_shaders(asic, &decoder); + wd = umr_scan_wave_data(asic); + umr_dump_shaders(asic, &decoder, wd); pdecoder = 
decoder.next_ib; while (pdecoder) { if (asic->options.follow_ib) { umr_dump_ib(asic, pdecoder); - umr_dump_shaders(asic, pdecoder); + umr_dump_shaders(asic, pdecoder, wd); } ppdecoder = pdecoder->next_ib; free(pdecoder); pdecoder = ppdecoder; } + while (wd) { + struct umr_wave_data *pwd = wd->next; + free(wd); + wd = pwd; + } + end: if (asic->options.halt_waves) umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME); diff --git a/src/lib/dump_ib.c b/src/lib/dump_ib.c index cdcbb8a70edd..d5e68d6981a0 100644 --- a/src/lib/dump_ib.c +++ b/src/lib/dump_ib.c @@ -67,7 +67,7 @@ void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder) printf("End of IB\n\n"); } -void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder) +void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd) { struct umr_shaders_pgm *pshader, *shader; @@ -79,7 +79,7 @@ void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder) BLUE, (unsigned)shader->vmid, RST, YELLOW, (unsigned long long)shader->src.ib_base, RST, YELLOW, (unsigned)shader->src.ib_offset * 4, RST); - umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size); + umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size, wd); printf("\n"); pshader = shader->next; free(shader); diff --git a/src/lib/umr_llvm_disasm.c b/src/lib/umr_llvm_disasm.c index 68f23f990fd2..5e1adf39a262 100644 --- a/src/lib/umr_llvm_disasm.c +++ b/src/lib/umr_llvm_disasm.c @@ -85,10 +85,31 @@ int umr_llvm_disasm(struct umr_asic *asic, return 0; } -void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size) +static struct umr_wave_data *find_wave(struct umr_wave_data *wd, unsigned vmid, uint64_t addr) { - uint32_t *opcodes, x; + while (wd) { + uint64_t PC; + PC = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo; + if (wd->ws.hw_id.vm_id == vmid && addr == PC) + break; + wd = wd->next; + } + return wd; +} + + +void 
umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd) +{ + uint32_t *opcodes, x, nwave, wavehits; char **opcode_strs = NULL; + struct umr_wave_data *pwd; + + wavehits = nwave = 0; + pwd = wd; + while (pwd) { + ++nwave; + pwd = pwd->next; + } opcodes = calloc(size/4, sizeof(*opcodes)); if (!opcodes) @@ -106,14 +127,35 @@ void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t printf(" * "); else printf(" "); - printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%s%s\n", + printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%-60s%s\t", BLUE, (unsigned long)vmid, RST, YELLOW, (unsigned long long)addr, RST, YELLOW, (unsigned)x * 4, RST, BLUE, (unsigned long)opcodes[x], RST, GREEN, opcode_strs[x], RST); free(opcode_strs[x]); + + if (wd) { + unsigned n; + pwd = find_wave(wd, vmid, addr + x * 4); + n = 0; + while (pwd) { + ++n; + ++wavehits; + if (asic->options.bitfields) + printf("[se%u.sh%u.cu%u.simd%u.wave%u] ", + (unsigned)pwd->se, (unsigned)pwd->sh, (unsigned)pwd->cu, (unsigned)pwd->ws.hw_id.simd_id, (unsigned)pwd->ws.hw_id.wave_id); + pwd = find_wave(pwd->next, vmid, addr + x * 4); + } + if (n) + printf("[%3u waves (%3u %%)]", n, (n * 100) / nwave); + } + printf("\n"); } + printf("End of disassembly.\n"); + + if (wd && wavehits) + printf("\t%u waves in this shader (out of %u active waves)\n", wavehits, nwave); free(opcode_strs); free(opcodes); diff --git a/src/umr.h b/src/umr.h index e99ee965527e..f026e82be98e 100644 --- a/src/umr.h +++ b/src/umr.h @@ -621,12 +621,13 @@ int umr_sq_cmd_halt_waves(struct umr_asic *asic, enum umr_sq_cmd_halt_resume mod /* IB/ring decoding/dumping/etc */ void umr_print_decode(struct umr_asic *asic, struct umr_ring_decoder *decoder, uint32_t ib); void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder); -void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder); +void 
umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd); int umr_llvm_disasm(struct umr_asic *asic, uint8_t *inst, unsigned inst_bytes, uint64_t PC, char **disasm_text); +void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd); uint32_t umr_compute_shader_size(struct umr_asic *asic, struct umr_shaders_pgm *shader); diff --git a/src/umrapp.h b/src/umrapp.h index 2f52d3093abe..e11a7d6e53f5 100644 --- a/src/umrapp.h +++ b/src/umrapp.h @@ -48,6 +48,4 @@ void umr_top(struct umr_asic *asic); void umr_print_config(struct umr_asic *asic); void umr_print_waves(struct umr_asic *asic); - -void umr_app_disasm(struct umr_asic *asic); -void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size); +void umr_profiler(struct umr_asic *asic, int samples, int delay); -- 2.14.3