Print the GuC captured error state register list (string names and values) when the gpu_coredump_state printout is invoked via the i915 debugfs to flush the previously captured gpu error-state. Since GuC could have reported multiple engine register dumps in a single notification event, parse the captured data (appearing as a stream of structures) to identify each dump as a different 'engine-capture-group-output'. Finally, for each 'engine-capture-group-output' that is found, verify whether the engine register dump corresponds to the engine_coredump content that was previously populated by the i915_gpu_coredump function. That function would have copied the context's VMAs, including the batch buffer, during the G2H-context-reset notification that occurred earlier. Perform this verification check by comparing the guc_id, lrca and engine-instance obtained from the 'engine-capture-group-output' against a copy of that same info taken during i915_gpu_coredump. If they match, then print those VMAs as well (such as the batch buffers). 
Signed-off-by: Alan Previn <alan.previn.teres.alexis@xxxxxxxxx> --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 4 +- .../gpu/drm/i915/gt/uc/intel_guc_capture.c | 439 ++++++++++++++++++ .../gpu/drm/i915/gt/uc/intel_guc_capture.h | 10 +- drivers/gpu/drm/i915/i915_gpu_error.c | 65 ++- drivers/gpu/drm/i915/i915_gpu_error.h | 14 + 5 files changed, 509 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 4317ae5e525b..47c0c32d9b86 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1628,9 +1628,7 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR)); } - if (intel_engine_uses_guc(engine)) { - /* nothing to print yet */ - } else if (HAS_EXECLISTS(dev_priv)) { + if (HAS_EXECLISTS(dev_priv) && !intel_engine_uses_guc(engine)) { struct i915_request * const *port, *rq; const u32 *hws = &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c index fc80c5f31915..1c8ad6a1c2d3 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c @@ -694,8 +694,423 @@ int intel_guc_capture_output_min_size_est(struct intel_guc *guc) * --> G2H STATE_CAPTURE_NOTIFICATION * L--> intel_guc_capture_store_snapshot * L--> Copies from B (head->tail) into C + * + * GUC --> notify context reset: + * ----------------------------- + * --> G2H CONTEXT RESET + * L--> guc_handle_context_reset --> i915_capture_error_state + * --> i915_gpu_coredump --> intel_guc_capture_store_ptr + * L--> keep a ptr to capture_store in + * i915_gpu_coredump struct. 
+ * + * User Sysfs / Debugfs + * -------------------- + * --> i915_gpu_coredump_copy_to_buffer-> + * L--> err_print_to_sgl --> err_print_gt + * L--> error_print_guc_captures + * L--> loop: intel_guc_capture_out_print_next_group + * */ +#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) + +static const char * +guc_capture_register_to_string(const struct intel_guc *guc, u32 owner, u32 type, + u32 class, u32 id, u32 offset, u32 *is_ext) +{ + struct __guc_mmio_reg_descr_group *reglists = guc->capture.priv->reglists; + struct __guc_mmio_reg_descr_group *match; + int num_regs, j; + + *is_ext = 0; + if (!reglists) + return NULL; + + match = guc_capture_get_one_list(reglists, owner, type, id); + + if (match) { + for (num_regs = match->num_regs, j = 0; j < num_regs; ++j) { + if (offset == match->list[j].reg.reg) + return match->list[j].regname; + } + } + if (match->ext) { + for (num_regs = match->num_ext, j = 0; j < num_regs; ++j) { + if (offset == match->ext[j].reg.reg) { + *is_ext = 1; + return match->ext[j].regname; + } + } + } + + return NULL; +} + +static int +guc_capture_store_remove_dw(struct guc_capture_out_store *store, u32 *bytesleft, + u32 *dw) +{ + int tries = 2; + int avail = 0; + u32 *src_data; + + if (!*bytesleft) + return 0; + + while (tries--) { + avail = CIRC_CNT_TO_END(store->head, store->tail, store->size); + if (avail >= sizeof(u32)) { + src_data = (u32 *)(store->addr + store->tail); + *dw = *src_data; + store->tail = (store->tail + 4) & (store->size - 1); + *bytesleft -= 4; + return 4; + } + if (store->tail == (store->size - 1) && store->head > 0) + store->tail = 0; + } + + return 0; +} + +static int +guc_capture_store_get_group_hdr(const struct intel_guc *guc, + struct guc_capture_out_store *store, u32 *bytesleft, + struct guc_state_capture_group_header_t *ghdr) +{ + int read = 0; + int fullsize = sizeof(struct guc_state_capture_group_header_t); + + if (fullsize > *bytesleft) + return -1; + + if (CIRC_CNT_TO_END(store->head, store->tail, store->size) >= 
fullsize) { + memcpy(ghdr, (store->addr + store->tail), fullsize); + store->tail = (store->tail + fullsize) & (store->size - 1); + *bytesleft -= fullsize; + return 0; + } + + read += guc_capture_store_remove_dw(store, bytesleft, &ghdr->reserved1); + read += guc_capture_store_remove_dw(store, bytesleft, &ghdr->info); + if (read != sizeof(*ghdr)) + return -1; + + return 0; +} + +static int +guc_capture_store_get_data_hdr(const struct intel_guc *guc, + struct guc_capture_out_store *store, u32 *bytesleft, + struct guc_state_capture_header_t *hdr) +{ + int read = 0; + int fullsize = sizeof(struct guc_state_capture_header_t); + + if (fullsize > *bytesleft) + return -1; + + if (CIRC_CNT_TO_END(store->head, store->tail, store->size) >= fullsize) { + memcpy(hdr, (store->addr + store->tail), fullsize); + store->tail = (store->tail + fullsize) & (store->size - 1); + *bytesleft -= fullsize; + return 0; + } + + read += guc_capture_store_remove_dw(store, bytesleft, &hdr->reserved1); + read += guc_capture_store_remove_dw(store, bytesleft, &hdr->info); + read += guc_capture_store_remove_dw(store, bytesleft, &hdr->lrca); + read += guc_capture_store_remove_dw(store, bytesleft, &hdr->guc_id); + read += guc_capture_store_remove_dw(store, bytesleft, &hdr->num_mmios); + if (read != sizeof(*hdr)) + return -1; + + return 0; +} + +static int +guc_capture_store_get_register(const struct intel_guc *guc, + struct guc_capture_out_store *store, u32 *bytesleft, + struct guc_mmio_reg *reg) +{ + int read = 0; + int fullsize = sizeof(struct guc_mmio_reg); + + if (fullsize > *bytesleft) + return -1; + + if (CIRC_CNT_TO_END(store->head, store->tail, store->size) >= fullsize) { + memcpy(reg, (store->addr + store->tail), fullsize); + store->tail = (store->tail + fullsize) & (store->size - 1); + *bytesleft -= fullsize; + return 0; + } + + read += guc_capture_store_remove_dw(store, bytesleft, &reg->offset); + read += guc_capture_store_remove_dw(store, bytesleft, &reg->value); + read += 
guc_capture_store_remove_dw(store, bytesleft, &reg->flags); + read += guc_capture_store_remove_dw(store, bytesleft, &reg->mask); + if (read != sizeof(*reg)) + return -1; + + return 0; +} + +static void guc_capture_store_drop_data(struct guc_capture_out_store *store, + unsigned long sampled_head) +{ + if (sampled_head == 0) + store->tail = store->size - 1; + else + store->tail = sampled_head - 1; +} + +#ifdef CONFIG_DRM_I915_DEBUG_GUC +#define guc_capt_err_print(a, b, ...) \ + do { \ + drm_warn(a, __VA_ARGS__); \ + if (b) \ + i915_error_printf(b, __VA_ARGS__); \ + } while (0) +#else +#define guc_capt_err_print(a, b, ...) \ + do { \ + if (b) \ + i915_error_printf(b, __VA_ARGS__); \ + } while (0) +#endif + +static struct intel_engine_cs * +guc_capture_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance) +{ + struct intel_gt *gt = guc_to_gt(guc); + u8 engine_class = guc_class_to_engine_class(guc_class); + + /* Class index is checked in class converter */ + GEM_BUG_ON(instance > MAX_ENGINE_INSTANCE); + + return gt->engine_class[engine_class][instance]; +} + +#define PRINT guc_capt_err_print +#define REGSTR guc_capture_register_to_string + +#define GCAP_PRINT_INTEL_ENG_INFO(i915, ebuf, eng) \ + do { \ + PRINT(&i915->drm, (ebuf), " i915-Eng-Name: %s command stream\n", (eng)->name); \ + PRINT(&i915->drm, (ebuf), " i915-Eng-Inst-Class: 0x%02x\n", (eng)->class); \ + PRINT(&i915->drm, (ebuf), " i915-Eng-Inst-Id: 0x%02x\n", (eng)->instance); \ + PRINT(&i915->drm, (ebuf), " i915-Eng-LogicalMask: 0x%08x\n", \ + (eng)->logical_mask); \ + } while (0) + +#define GCAP_PRINT_GUC_INST_INFO(i915, ebuf, hdr) \ + do { \ + PRINT(&i915->drm, (ebuf), " GuC-Engine-Inst-Id: 0x%08x\n", \ + (uint32_t)FIELD_GET(CAP_HDR_ENGINE_INSTANCE, (hdr).info)); \ + PRINT(&i915->drm, (ebuf), " GuC-Context-Id: 0x%08x\n", (hdr).guc_id); \ + PRINT(&i915->drm, (ebuf), " LRCA: 0x%08x\n", (hdr).lrca); \ + } while (0) + +#define GCAP_PRINT_INTEL_CTX_INFO(i915, ebuf, ce) \ + do { \ + PRINT(&i915->drm, (ebuf), " 
i915-Ctx-Flags: 0x%016lx\n", (ce)->flags); \ + PRINT(&i915->drm, (ebuf), " i915-Ctx-GuC-Id: 0x%016x\n", (ce)->guc_id.id); \ + } while (0) + +#define GCAP_PRINT_BATCH(i915, ebuf, ee, batch) \ + do { \ + batch = intel_gpu_error_find_batch(ee); \ + if (batch) { \ + u64 start = batch->gtt_offset; \ + u64 end = start + batch->gtt_size; \ + PRINT(&i915->drm, (ebuf), " batch: [0x%08x_%08x, 0x%08x_%08x]\n", \ + upper_32_bits(start), lower_32_bits(start), \ + upper_32_bits(end), lower_32_bits(end)); \ + } \ + } while (0) + +#define GCAP_PRINT_CONTEXT(i915, ebuf, ctx) \ + do { \ + const u32 period = to_gt(ebuf->i915)->clock_period_ns; \ + PRINT(&i915->drm, (ebuf), " Active context: %s[%d] prio %d, guilty %d " \ + "active %d, runtime total %lluns, avg %lluns\n", \ + ctx->comm, ctx->pid, ctx->sched_attr.priority, \ + ctx->guilty, ctx->active, \ + ctx->total_runtime * period, \ + mul_u32_u32(ctx->avg_runtime, period)); \ + } while (0) + +int intel_guc_capture_out_print_next_group(struct drm_i915_error_state_buf *ebuf, + struct intel_gt_coredump *gt) +{ + /* constant qualifier for data-pointers we shouldn't change mid of error dump printing */ + struct intel_guc_state_capture *cap = gt->uc->capture; + struct intel_guc *guc = container_of(cap, struct intel_guc, capture); + struct drm_i915_private *i915 = (container_of(guc, struct intel_gt, + uc.guc))->i915; + struct guc_capture_out_store *store; + struct guc_capture_out_store tmpstore; + struct guc_state_capture_group_header_t ghdr; + struct guc_state_capture_header_t hdr; + struct guc_mmio_reg reg; + const char *grptypestr[GUC_STATE_CAPTURE_GROUP_TYPE_MAX] = {"full-capture", + "partial-capture"}; + const char *datatypestr[GUC_CAPTURE_LIST_TYPE_MAX] = {"Global", "Engine-Class", + "Engine-Instance"}; + enum guc_capture_group_types grptype; + enum guc_capture_type datatype; + int numgrps, numregs, ret = 0; + const char *str; + char noname[16]; + u32 numbytes, guc_engclss, guc_enginst, guc_lrca, guc_gucid, is_ext; + struct 
intel_engine_cs *eng; + const struct intel_engine_coredump *ee; + const struct i915_gem_context_coredump *ctx; + struct i915_vma_coredump *batch; + + if (!cap->priv) + return -ENODEV; + + store = &cap->priv->out_store; + + mutex_lock(&store->lock); + smp_mb(); /* sync to get the latest head for the moment */ + /* NOTE1: make a copy of store so we dont have to deal with a changing lower bound of + * occupied-space in this circular buffer. + * NOTE2: Higher up the stack from here, we keep calling this function in a loop to + * reading more capture groups as they appear (as the lower bound of occupied-space + * changes) until this circ-buf is empty. + */ + memcpy(&tmpstore, store, sizeof(tmpstore)); + + PRINT(&i915->drm, ebuf, "global --- GuC Error Capture\n"); + + numbytes = CIRC_CNT(tmpstore.head, tmpstore.tail, tmpstore.size); + if (!numbytes) { + PRINT(&i915->drm, ebuf, "GuC err-capture parsing done\n"); + ret = -ENODATA; + goto unlock; + } + /* everything in GuC output structures are dword aligned */ + if (numbytes & 0x3) { + PRINT(&i915->drm, ebuf, "GuC capture stream unaligned!\n"); + ret = -EIO; + goto unlock; + } + + if (guc_capture_store_get_group_hdr(guc, &tmpstore, &numbytes, &ghdr)) { + PRINT(&i915->drm, ebuf, "GuC capture error getting next group-header!\n"); + ret = -EIO; + goto unlock; + } + + PRINT(&i915->drm, ebuf, "NumCaptures: 0x%08x\n", (uint32_t) + FIELD_GET(CAP_GRP_HDR_NUM_CAPTURES, ghdr.info)); + grptype = FIELD_GET(CAP_GRP_HDR_CAPTURE_TYPE, ghdr.info); + PRINT(&i915->drm, ebuf, "Coverage: 0x%08x = %s\n", grptype, + grptypestr[grptype % GUC_STATE_CAPTURE_GROUP_TYPE_MAX]); + + numgrps = FIELD_GET(CAP_GRP_HDR_NUM_CAPTURES, ghdr.info); + while (numgrps--) { + if (guc_capture_store_get_data_hdr(guc, &tmpstore, &numbytes, &hdr)) { + PRINT(&i915->drm, ebuf, "GuC capture error on next capture-header!\n"); + ret = -EIO; + goto unlock; + } + datatype = FIELD_GET(CAP_HDR_CAPTURE_TYPE, hdr.info); + PRINT(&i915->drm, ebuf, " RegListType: %s\n", + 
datatypestr[datatype % GUC_CAPTURE_LIST_TYPE_MAX]); + + eng = NULL; + guc_engclss = 0xffffffff; + guc_enginst = 0xffffffff; + guc_gucid = guc_lrca = 0; + guc_engclss = FIELD_GET(CAP_HDR_ENGINE_CLASS, hdr.info); + if (datatype != GUC_CAPTURE_LIST_TYPE_GLOBAL) { + PRINT(&i915->drm, ebuf, " GuC-Engine-Class: %d\n", + guc_engclss); + if (datatype == GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS && + guc_engclss <= GUC_LAST_ENGINE_CLASS) + PRINT(&i915->drm, ebuf, " i915-Eng-Class: %d\n", + guc_class_to_engine_class(guc_engclss)); + + if (datatype == GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE) { + guc_enginst = FIELD_GET(CAP_HDR_ENGINE_INSTANCE, hdr.info); + eng = guc_capture_lookup_engine(guc, guc_engclss, guc_enginst); + if (eng) + GCAP_PRINT_INTEL_ENG_INFO(i915, ebuf, eng); + else + PRINT(&i915->drm, ebuf, + " i915-Eng-Lookup Fail!\n"); + guc_lrca = hdr.lrca; + guc_gucid = hdr.guc_id; + GCAP_PRINT_GUC_INST_INFO(i915, ebuf, hdr); + } + } + numregs = FIELD_GET(CAP_HDR_NUM_MMIOS, hdr.num_mmios); + PRINT(&i915->drm, ebuf, " NumRegs: %d\n", numregs); + + while (numregs--) { + if (guc_capture_store_get_register(guc, &tmpstore, &numbytes, &reg)) { + PRINT(&i915->drm, ebuf, "Error getting next register!\n"); + ret = -EIO; + goto unlock; + } + str = REGSTR(guc, GUC_CAPTURE_LIST_INDEX_PF, datatype, + guc_engclss, 0, reg.offset, &is_ext); + if (!str) { + snprintf(noname, sizeof(noname), "REG-0x%08x", reg.offset); + PRINT(&i915->drm, ebuf, " %s", noname); + } else { + PRINT(&i915->drm, ebuf, " %s", str); + } + if (is_ext) + PRINT(&i915->drm, ebuf, "[%ld][%ld]", + FIELD_GET(GUC_REGSET_STEERING_GROUP, reg.flags), + FIELD_GET(GUC_REGSET_STEERING_INSTANCE, reg.flags)); + PRINT(&i915->drm, ebuf, ": 0x%08x\n", reg.value); + } + for (ee = gt->engine; ee; ee = ee->next) { + const struct i915_vma_coredump *vma; + + if (ee->engine == eng && + guc_enginst == GUC_ID_TO_ENGINE_INSTANCE(ee->gucinfo.eng_id) && + guc_engclss == GUC_ID_TO_ENGINE_CLASS(ee->gucinfo.eng_id) && + ee->gucinfo.guc_id == guc_gucid && + 
(ee->gucinfo.lrca & CTX_GTT_ADDRESS_MASK) == + (guc_lrca & CTX_GTT_ADDRESS_MASK)) { + PRINT(&i915->drm, ebuf, "i915-Ctx-VMA-Matched:\n"); + GCAP_PRINT_BATCH(i915, ebuf, ee, batch); + PRINT(&i915->drm, ebuf, " engine reset count: %u\n", + ee->reset_count); + ctx = &ee->context; + GCAP_PRINT_CONTEXT(i915, ebuf, ctx); + + for (vma = ee->vma; vma; vma = vma->next) + intel_gpu_error_print_vma(ebuf, ee->engine, vma); + } + } + } + + store->tail = tmpstore.tail; +unlock: + /* if we have a stream error, just drop everything */ + if (ret == -EIO) { + drm_warn(&i915->drm, "Skip GuC capture header print due to stream error\n"); + guc_capture_store_drop_data(store, tmpstore.head); + } + + mutex_unlock(&store->lock); + + return ret; +} + +#undef REGSTR +#undef PRINT + +#endif //CONFIG_DRM_I915_DEBUG_GUC + static void guc_capture_store_insert(struct intel_guc *guc, struct guc_capture_out_store *store, unsigned char *new_data, size_t bytes) { @@ -846,6 +1261,30 @@ void intel_guc_capture_destroy(struct intel_guc *guc) guc->capture.priv = NULL; } +void intel_guc_capture_copy_info(struct intel_engine_coredump *ee, struct intel_context *ce) +{ + if (!ee || !ce) + return; + /* + * Store GuC relatable information pertaining to the faulting + * context into the intel_engine_coredump structure that we can + * reference later during the debugfs triggered printout function + * to ensure we print the vma dumps matching that match + * the GuC register dumps + */ + ee->gucinfo.lrca = ce->lrc.lrca; + ee->gucinfo.guc_id = ce->guc_id.id; + ee->gucinfo.eng_id = ee->engine->guc_id; +} + +struct intel_guc_state_capture * +intel_guc_capture_store_ptr(struct intel_guc *guc) +{ + if (!guc->capture.priv) + return NULL; + return &guc->capture; +} + int intel_guc_capture_init(struct intel_guc *guc) { int ret; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h index c240a4cc046b..37e29f76cda8 100644 --- 
a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h @@ -8,15 +8,23 @@ #include <linux/types.h> -struct intel_guc; +struct drm_i915_error_state_buf; struct guc_ads; struct guc_gt_system_info; +struct intel_gt_coredump; +struct intel_guc; +struct intel_engine_coredump; +struct intel_context; int intel_guc_capture_prep_lists(struct intel_guc *guc, struct guc_ads *blob, u32 blob_ggtt, u32 capture_offset, struct guc_gt_system_info *sysinfo); +int intel_guc_capture_out_print_next_group(struct drm_i915_error_state_buf *m, + struct intel_gt_coredump *gt); +void intel_guc_capture_copy_info(struct intel_engine_coredump *ee, struct intel_context *ce); void intel_guc_capture_store_snapshot(struct intel_guc *guc); int intel_guc_capture_output_min_size_est(struct intel_guc *guc); void intel_guc_capture_destroy(struct intel_guc *guc); +struct intel_guc_state_capture *intel_guc_capture_store_ptr(struct intel_guc *guc); int intel_guc_capture_init(struct intel_guc *guc); #endif /* _INTEL_GUC_CAPTURE_H */ diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 67f3515f07e7..4eeab55b4314 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -526,8 +526,8 @@ __find_vma(struct i915_vma_coredump *vma, const char *name) return NULL; } -static struct i915_vma_coredump * -find_batch(const struct intel_engine_coredump *ee) +struct i915_vma_coredump * +intel_gpu_error_find_batch(const struct intel_engine_coredump *ee) { return __find_vma(ee->vma, "batch"); } @@ -555,7 +555,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, error_print_instdone(m, ee); - batch = find_batch(ee); + batch = intel_gpu_error_find_batch(ee); if (batch) { u64 start = batch->gtt_offset; u64 end = start + batch->gtt_size; @@ -601,6 +601,16 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, error_print_context(m, " Active context: ", &ee->context); 
} +static void error_print_guc_captures(struct drm_i915_error_state_buf *m, + struct intel_gt_coredump *gt) +{ + int ret; + + do { + ret = intel_guc_capture_out_print_next_group(m, gt); + } while (!ret); +} + void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) { va_list args; @@ -610,9 +620,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) va_end(args); } -static void print_error_vma(struct drm_i915_error_state_buf *m, - const struct intel_engine_cs *engine, - const struct i915_vma_coredump *vma) +void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, + const struct intel_engine_cs *engine, + const struct i915_vma_coredump *vma) { char out[ASCII85_BUFSZ]; struct page *page; @@ -681,7 +691,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, intel_uc_fw_dump(&error_uc->guc_fw, &p); intel_uc_fw_dump(&error_uc->huc_fw, &p); - print_error_vma(m, NULL, error_uc->guc_log); + intel_gpu_error_print_vma(m, NULL, error_uc->guc_log); } static void err_free_sgl(struct scatterlist *sgl) @@ -766,12 +776,17 @@ static void err_print_gt(struct drm_i915_error_state_buf *m, err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done); } - for (ee = gt->engine; ee; ee = ee->next) { - const struct i915_vma_coredump *vma; + if (gt->uc && gt->uc->capture) { + /* error capture was via GuC */ + error_print_guc_captures(m, gt); + } else { + for (ee = gt->engine; ee; ee = ee->next) { + const struct i915_vma_coredump *vma; - error_print_engine(m, ee); - for (vma = ee->vma; vma; vma = vma->next) - print_error_vma(m, ee->engine, vma); + error_print_engine(m, ee); + for (vma = ee->vma; vma; vma = vma->next) + intel_gpu_error_print_vma(m, ee->engine, vma); + } } if (gt->uc) @@ -1146,7 +1161,7 @@ static void gt_record_fences(struct intel_gt_coredump *gt) gt->nfence = i; } -static void engine_record_registers(struct intel_engine_coredump *ee) +static void engine_record_registers_execlist(struct intel_engine_coredump *ee) { 
const struct intel_engine_cs *engine = ee->engine; struct drm_i915_private *i915 = engine->i915; @@ -1443,8 +1458,10 @@ intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp) ee->engine = engine; - engine_record_registers(ee); - engine_record_execlists(ee); + if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { + engine_record_registers_execlist(ee); + engine_record_execlists(ee); + } return ee; } @@ -1515,11 +1532,14 @@ capture_engine(struct intel_engine_cs *engine, struct intel_context *ce; struct i915_request *rq = NULL; unsigned long flags; + bool guc_submission = false; ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); if (!ee) return NULL; + guc_submission = intel_uc_uses_guc_submission(&engine->gt->uc); + ce = intel_engine_get_hung_context(engine); if (ce) { intel_engine_clear_hung_context(engine); @@ -1531,7 +1551,7 @@ capture_engine(struct intel_engine_cs *engine, * Getting here with GuC enabled means it is a forced error capture * with no actual hang. So, no need to attempt the execlist search. */ - if (!intel_uc_uses_guc_submission(&engine->gt->uc)) { + if (!guc_submission) { spin_lock_irqsave(&engine->sched_engine->lock, flags); rq = intel_engine_execlist_find_hung_request(engine); spin_unlock_irqrestore(&engine->sched_engine->lock, @@ -1549,6 +1569,8 @@ capture_engine(struct intel_engine_cs *engine, i915_request_put(rq); goto no_request_capture; } + if (guc_submission) + intel_guc_capture_copy_info(ee, ce); intel_engine_coredump_add_vma(ee, capture, compress); i915_request_put(rq); @@ -1617,8 +1639,8 @@ gt_record_uc(struct intel_gt_coredump *gt, return error_uc; } -/* Capture all registers which don't fit into another category. */ -static void gt_record_regs(struct intel_gt_coredump *gt) +/* Capture all global registers which don't fit into another category. 
*/ +static void gt_record_registers_execlist(struct intel_gt_coredump *gt) { struct intel_uncore *uncore = gt->_gt->uncore; struct drm_i915_private *i915 = uncore->i915; @@ -1862,7 +1884,9 @@ intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp) gc->_gt = gt; gc->awake = intel_gt_pm_is_awake(gt); - gt_record_regs(gc); + if (!intel_uc_uses_guc_submission(&gt->uc)) + gt_record_registers_execlist(gc); + gt_record_fences(gc); return gc; @@ -1927,6 +1951,9 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask) if (INTEL_INFO(i915)->has_gt_uc) error->gt->uc = gt_record_uc(error->gt, compress); + if (intel_uc_uses_guc_submission(&gt->uc)) + error->gt->uc->capture = intel_guc_capture_store_ptr(&gt->uc.guc); + i915_vma_capture_finish(error->gt, compress); error->simulated |= error->gt->simulated; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 5aedf5129814..576677c2888e 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -17,6 +17,7 @@ #include "gt/intel_engine.h" #include "gt/intel_gt_types.h" #include "gt/uc/intel_uc_fw.h" +#include "gt/uc/intel_guc_capture.h" #include "intel_device_info.h" @@ -84,6 +85,13 @@ struct intel_engine_coredump { u32 rc_psmi; /* sleep state */ struct intel_instdone instdone; + /* GuC correlated info */ + struct { + u32 lrca; + u16 guc_id; + u32 eng_id; + } gucinfo; + struct i915_gem_context_coredump { char comm[TASK_COMM_LEN]; @@ -149,6 +157,7 @@ struct intel_gt_coredump { struct intel_uc_fw guc_fw; struct intel_uc_fw huc_fw; struct i915_vma_coredump *guc_log; + struct intel_guc_state_capture *capture; } *uc; struct intel_gt_coredump *next; @@ -214,6 +223,11 @@ struct drm_i915_error_state_buf { __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); +void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, + const struct intel_engine_cs *engine, + const struct i915_vma_coredump *vma); 
+struct i915_vma_coredump * +intel_gpu_error_find_batch(const struct intel_engine_coredump *ee); struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask); -- 2.25.1