Trust that we now have adequate protection over the low level structures
via the engine->active.lock to allow ourselves to capture the GPU error
state without the heavy hammer of stop_machine(). Sadly this does mean
that we have to forgo some of the lesser used information (not derived
from the active state) that is not controlled by the active locks. A
useful side-effect is that this allows us to restore error capturing
for Braswell and Broxton.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c   |   5 -
 drivers/gpu/drm/i915/i915_gpu_error.c | 401 ++++++++------------------
 drivers/gpu/drm/i915/i915_gpu_error.h |  16 -
 3 files changed, 117 insertions(+), 305 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 236c964dd761..e0645ce4fb84 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3135,11 +3135,6 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
 			ggtt->vm.insert_page = bxt_vtd_ggtt_insert_page__BKL;
 		if (ggtt->vm.clear_range != nop_clear_range)
 			ggtt->vm.clear_range = bxt_vtd_ggtt_clear_range__BKL;
-
-		/* Prevent recursively calling stop_machine() and deadlocks. */
-		dev_info(dev_priv->drm.dev,
-			 "Disabling error capture for VT-d workaround\n");
-		i915_disable_error_state(dev_priv, -ENODEV);
 	}
 
 	ggtt->invalidate = gen6_ggtt_invalidate;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f297a43df1e9..2af883dccfc7 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -30,7 +30,6 @@
 #include <linux/ascii85.h>
 #include <linux/nmi.h>
 #include <linux/scatterlist.h>
-#include <linux/stop_machine.h>
 #include <linux/utsname.h>
 #include <linux/zlib.h>
 
@@ -46,6 +45,9 @@
 #include "i915_scatterlist.h"
 #include "intel_csr.h"
 
+#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
+#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
+
 static inline const struct intel_engine_cs *
 engine_lookup(const struct drm_i915_private *i915, unsigned int id)
 {
@@ -67,26 +69,6 @@ engine_name(const struct drm_i915_private *i915, unsigned int id)
 	return __engine_name(engine_lookup(i915, id));
 }
 
-static const char *tiling_flag(int tiling)
-{
-	switch (tiling) {
-	default:
-	case I915_TILING_NONE: return "";
-	case I915_TILING_X: return " X";
-	case I915_TILING_Y: return " Y";
-	}
-}
-
-static const char *dirty_flag(int dirty)
-{
-	return dirty ? " dirty" : "";
-}
-
-static const char *purgeable_flag(int purgeable)
-{
-	return purgeable ? " purgeable" : "";
-}
-
 static void __sg_set_buf(struct scatterlist *sg,
 			 void *addr, unsigned int len, loff_t it)
 {
@@ -114,7 +96,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	if (e->cur == e->end) {
 		struct scatterlist *sgl;
 
-		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
+		sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
 		if (!sgl) {
 			e->err = -ENOMEM;
 			return false;
@@ -134,7 +116,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
 	}
 
 	e->size = ALIGN(len + 1, SZ_64K);
-	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+	e->buf = kmalloc(e->size, ALLOW_FAIL);
 	if (!e->buf) {
 		e->size = PAGE_ALIGN(len + 1);
 		e->buf = kmalloc(e->size, GFP_KERNEL);
@@ -220,26 +202,32 @@ struct compress {
 
 static bool compress_init(struct compress *c)
 {
-	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
+	struct z_stream_s *zstream = &c->zstream;
 
 	zstream->workspace =
 		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
-			GFP_ATOMIC | __GFP_NOWARN);
+			ALLOW_FAIL);
 	if (!zstream->workspace)
 		return false;
 
-	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
-		kfree(zstream->workspace);
-		return false;
-	}
-
 	c->tmp = NULL;
 	if (i915_has_memcpy_from_wc())
-		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+		c->tmp = (void *)__get_free_page(ALLOW_FAIL);
 
 	return true;
 }
 
+static bool compress_start(struct compress *c)
+{
+	struct z_stream_s *zstream = &c->zstream;
+	void *workspace = zstream->workspace;
+
+	memset(zstream, 0, sizeof(*zstream));
+	zstream->workspace = workspace;
+
+	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
+}
+
 static void *compress_next_page(struct drm_i915_error_object *dst)
 {
 	unsigned long page;
@@ -247,7 +235,7 @@ static void *compress_next_page(struct drm_i915_error_object *dst)
 	if (dst->page_count >= dst->num_pages)
 		return ERR_PTR(-ENOSPC);
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	page = __get_free_page(ATOMIC_MAYFAIL);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -316,13 +304,14 @@ static int compress_flush(struct compress *c,
 	return 0;
 }
 
-static void compress_fini(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
 {
-	struct z_stream_s *zstream = &c->zstream;
+	zlib_deflateEnd(&c->zstream);
+}
 
-	zlib_deflateEnd(zstream);
-	kfree(zstream->workspace);
+static void compress_fini(struct compress *c)
+{
+	kfree(c->zstream.workspace);
 
 	if (c->tmp)
 		free_page((unsigned long)c->tmp);
@@ -342,6 +331,11 @@ static bool compress_init(struct compress *c)
 	return true;
 }
 
+static bool compress_start(struct compress *c)
+{
+	return true;
+}
+
 static int compress_page(struct compress *c,
 			 void *src,
 			 struct drm_i915_error_object *dst)
@@ -349,7 +343,7 @@ static int compress_page(struct compress *c,
 	unsigned long page;
 	void *ptr;
 
-	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+	page = __get_free_page(ATOMIC_MAYFAIL);
 	if (!page)
 		return -ENOMEM;
 
@@ -367,8 +361,11 @@ static int compress_flush(struct compress *c,
 	return 0;
 }
 
-static void compress_fini(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
+{
+}
+
+static void compress_fini(struct compress *c)
 {
 }
 
@@ -379,36 +376,6 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
 }
 #endif
 
-static void print_error_buffers(struct drm_i915_error_state_buf *m,
-				const char *name,
-				struct drm_i915_error_buffer *err,
-				int count)
-{
-	err_printf(m, "%s [%d]:\n", name, count);
-
-	while (count--) {
-		err_printf(m, "    %08x_%08x %8u %02x %02x",
-			   upper_32_bits(err->gtt_offset),
-			   lower_32_bits(err->gtt_offset),
-			   err->size,
-			   err->read_domains,
-			   err->write_domain);
-		err_puts(m, tiling_flag(err->tiling));
-		err_puts(m, dirty_flag(err->dirty));
-		err_puts(m, purgeable_flag(err->purgeable));
-		err_puts(m, err->userptr ? " userptr" : "");
-		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
-
-		if (err->name)
-			err_printf(m, " (name: %d)", err->name);
-		if (err->fence_reg != I915_FENCE_REG_NONE)
-			err_printf(m, " (fence: %d)", err->fence_reg);
-
-		err_puts(m, "\n");
-		err++;
-	}
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 const struct drm_i915_error_engine *ee)
 {
@@ -734,33 +701,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		error_print_engine(m, &error->engine[i], error->epoch);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
-		char buf[128];
-		int len, first = 1;
-
-		if (!error->active_vm[i])
-			break;
-
-		len = scnprintf(buf, sizeof(buf), "Active (");
-		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
-			if (error->engine[j].vm != error->active_vm[i])
-				continue;
-
-			len += scnprintf(buf + len, sizeof(buf), "%s%s",
-					 first ? "" : ", ",
-					 m->i915->engine[j]->name);
-			first = 0;
-		}
-		scnprintf(buf + len, sizeof(buf), ")");
-		print_error_buffers(m, buf,
-				    error->active_bo[i],
-				    error->active_bo_count[i]);
-	}
-
-	print_error_buffers(m, "Pinned (global)",
-			    error->pinned_bo,
-			    error->pinned_bo_count);
-
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		const struct drm_i915_error_engine *ee = &error->engine[i];
 
@@ -974,10 +914,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
 		kfree(ee->requests);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
-		kfree(error->active_bo[i]);
-	kfree(error->pinned_bo);
-
 	kfree(error->overlay);
 	kfree(error->display);
 
@@ -990,12 +926,12 @@ void __i915_gpu_state_free(struct kref *error_ref)
 
 static struct drm_i915_error_object *
 i915_error_object_create(struct drm_i915_private *i915,
-			 struct i915_vma *vma)
+			 struct i915_vma *vma,
+			 struct compress *compress)
 {
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	const u64 slot = ggtt->error_capture.start;
 	struct drm_i915_error_object *dst;
-	struct compress compress;
 	unsigned long num_pages;
 	struct sgt_iter iter;
 	dma_addr_t dma;
@@ -1006,22 +942,21 @@ i915_error_object_create(struct drm_i915_private *i915,
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
 	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
-	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
-		      GFP_ATOMIC | __GFP_NOWARN);
+	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ATOMIC_MAYFAIL);
 	if (!dst)
 		return NULL;
 
+	if (!compress_start(compress)) {
+		kfree(dst);
+		return NULL;
+	}
+
 	dst->gtt_offset = vma->node.start;
 	dst->gtt_size = vma->node.size;
 	dst->num_pages = num_pages;
 	dst->page_count = 0;
 	dst->unused = 0;
 
-	if (!compress_init(&compress)) {
-		kfree(dst);
-		return NULL;
-	}
-
 	ret = -EINVAL;
 	for_each_sgt_dma(dma, iter, vma->pages) {
 		void __iomem *s;
@@ -1029,69 +964,23 @@ i915_error_object_create(struct drm_i915_private *i915,
 		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
 
 		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
-		ret = compress_page(&compress, (void __force *)s, dst);
+		ret = compress_page(compress, (void __force *)s, dst);
 		io_mapping_unmap_atomic(s);
 		if (ret)
 			break;
 	}
 
-	if (ret || compress_flush(&compress, dst)) {
+	if (ret || compress_flush(compress, dst)) {
 		while (dst->page_count--)
 			free_page((unsigned long)dst->pages[dst->page_count]);
 		kfree(dst);
 		dst = NULL;
 	}
+	compress_finish(compress);
 
-	compress_fini(&compress, dst);
 	return dst;
 }
 
-static void capture_bo(struct drm_i915_error_buffer *err,
-		       struct i915_vma *vma)
-{
-	struct drm_i915_gem_object *obj = vma->obj;
-
-	err->size = obj->base.size;
-	err->name = obj->base.name;
-
-	err->gtt_offset = vma->node.start;
-	err->read_domains = obj->read_domains;
-	err->write_domain = obj->write_domain;
-	err->fence_reg = vma->fence ? vma->fence->id : -1;
-	err->tiling = i915_gem_object_get_tiling(obj);
-	err->dirty = obj->mm.dirty;
-	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
-	err->userptr = obj->userptr.mm != NULL;
-	err->cache_level = obj->cache_level;
-}
-
-static u32 capture_error_bo(struct drm_i915_error_buffer *err,
-			    int count, struct list_head *head,
-			    unsigned int flags)
-#define ACTIVE_ONLY BIT(0)
-#define PINNED_ONLY BIT(1)
-{
-	struct i915_vma *vma;
-	int i = 0;
-
-	list_for_each_entry(vma, head, vm_link) {
-		if (!vma->obj)
-			continue;
-
-		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
-			continue;
-
-		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
-			continue;
-
-		capture_bo(err++, vma);
-		if (++i == count)
-			break;
-	}
-
-	return i;
-}
-
 /*
  * Generate a semi-unique error code. The code is not meant to have meaning, The
  * code's only purpose is to try to prevent false duplicated bug reports by
@@ -1281,7 +1170,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 	if (!count)
 		return;
 
-	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
+	ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
 	if (!ee->requests)
 		return;
 
@@ -1349,8 +1238,10 @@ static void record_context(struct drm_i915_error_context *e,
 	e->active = atomic_read(&ctx->active_count);
 }
 
-static void request_record_user_bo(struct i915_request *request,
-				   struct drm_i915_error_engine *ee)
+static void
+request_record_user_bo(struct i915_request *request,
+		       struct drm_i915_error_engine *ee,
+		       struct compress *compress)
 {
 	struct i915_capture_list *c;
 	struct drm_i915_error_object **bo;
@@ -1362,18 +1253,20 @@ static void request_record_user_bo(struct i915_request *request,
 	if (!max)
 		return;
 
-	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
 	if (!bo) {
 		/* If we can't capture everything, try to capture something. */
 		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
-		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
	}
 	if (!bo)
 		return;
 
 	count = 0;
 	for (c = request->capture_list; c; c = c->next) {
-		bo[count] = i915_error_object_create(request->i915, c->vma);
+		bo[count] = i915_error_object_create(request->i915,
+						     c->vma,
+						     compress);
 		if (!bo[count])
 			break;
 		if (++count == max)
@@ -1386,7 +1279,8 @@ static void request_record_user_bo(struct i915_request *request,
 
 static struct drm_i915_error_object *
 capture_object(struct drm_i915_private *dev_priv,
-	       struct drm_i915_gem_object *obj)
+	       struct drm_i915_gem_object *obj,
+	       struct compress *compress)
 {
 	if (obj && i915_gem_object_has_pages(obj)) {
 		struct i915_vma fake = {
@@ -1396,13 +1290,14 @@ capture_object(struct drm_i915_private *dev_priv,
 			.obj = obj,
 		};
 
-		return i915_error_object_create(dev_priv, &fake);
+		return i915_error_object_create(dev_priv, &fake, compress);
 	} else {
 		return NULL;
 	}
 }
 
-static void gem_record_rings(struct i915_gpu_state *error)
+static void
+gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
 {
 	struct drm_i915_private *i915 = error->i915;
 	int i;
@@ -1438,17 +1333,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
 		 * by userspace.
 		 */
 		ee->batchbuffer =
-			i915_error_object_create(i915, request->batch);
+			i915_error_object_create(i915,
+						 request->batch,
+						 compress);
 
 		if (HAS_BROKEN_CS_TLB(i915))
 			ee->wa_batchbuffer =
 				i915_error_object_create(i915,
-							 engine->gt->scratch);
-		request_record_user_bo(request, ee);
+							 engine->gt->scratch,
+							 compress);
+		request_record_user_bo(request, ee, compress);
 
 		ee->ctx =
 			i915_error_object_create(i915,
-						 request->hw_context->state);
+						 request->hw_context->state,
+						 compress);
 
 		error->simulated |=
 			i915_gem_context_no_error_capture(ctx);
@@ -1460,7 +1359,9 @@ static void gem_record_rings(struct i915_gpu_state *error)
 			ee->cpu_ring_head = ring->head;
 			ee->cpu_ring_tail = ring->tail;
 			ee->ringbuffer =
-				i915_error_object_create(i915, ring->vma);
+				i915_error_object_create(i915,
+							 ring->vma,
+							 compress);
 
 			engine_record_requests(engine, request, ee);
 		}
@@ -1468,89 +1369,21 @@ static void gem_record_rings(struct i915_gpu_state *error)
 
 		ee->hws_page =
 			i915_error_object_create(i915,
-						 engine->status_page.vma);
+						 engine->status_page.vma,
+						 compress);
 
-		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
-
-		ee->default_state = capture_object(i915, engine->default_state);
-	}
-}
-
-static void gem_capture_vm(struct i915_gpu_state *error,
-			   struct i915_address_space *vm,
-			   int idx)
-{
-	struct drm_i915_error_buffer *active_bo;
-	struct i915_vma *vma;
-	int count;
-
-	count = 0;
-	list_for_each_entry(vma, &vm->bound_list, vm_link)
-		if (i915_vma_is_active(vma))
-			count++;
-
-	active_bo = NULL;
-	if (count)
-		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
-	if (active_bo)
-		count = capture_error_bo(active_bo,
-					 count, &vm->bound_list,
-					 ACTIVE_ONLY);
-	else
-		count = 0;
-
-	error->active_vm[idx] = vm;
-	error->active_bo[idx] = active_bo;
-	error->active_bo_count[idx] = count;
-}
-
-static void capture_active_buffers(struct i915_gpu_state *error)
-{
-	int cnt = 0, i, j;
-
-	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
-	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
-	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
-
-	/* Scan each engine looking for unique active contexts/vm */
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		struct drm_i915_error_engine *ee = &error->engine[i];
-		bool found;
-
-		if (!ee->vm)
-			continue;
+		ee->wa_ctx =
+			i915_error_object_create(i915,
+						 engine->wa_ctx.vma,
+						 compress);
 
-		found = false;
-		for (j = 0; j < i && !found; j++)
-			found = error->engine[j].vm == ee->vm;
-		if (!found)
-			gem_capture_vm(error, ee->vm, cnt++);
+		ee->default_state =
+			capture_object(i915, engine->default_state, compress);
 	}
 }
 
-static void capture_pinned_buffers(struct i915_gpu_state *error)
-{
-	struct i915_address_space *vm = &error->i915->ggtt.vm;
-	struct drm_i915_error_buffer *bo;
-	struct i915_vma *vma;
-	int count;
-
-	count = 0;
-	list_for_each_entry(vma, &vm->bound_list, vm_link)
-		count++;
-
-	bo = NULL;
-	if (count)
-		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
-	if (!bo)
-		return;
-
-	error->pinned_bo_count =
-		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
-	error->pinned_bo = bo;
-}
-
-static void capture_uc_state(struct i915_gpu_state *error)
+static void
+capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
 {
 	struct drm_i915_private *i915 = error->i915;
 	struct i915_error_uc *error_uc = &error->uc;
@@ -1566,9 +1399,11 @@ static void capture_uc_state(struct i915_gpu_state *error)
 	 * As modparams are generally accesible from the userspace make
 	 * explicit copies of the firmware paths.
 	 */
-	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
-	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
-	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
+	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, ALLOW_FAIL);
+	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, ALLOW_FAIL);
+	error_uc->guc_log = i915_error_object_create(i915,
+						     i915->guc.log.vma,
+						     compress);
 }
 
 /* Capture all registers which don't fit into another category. */
@@ -1752,56 +1587,54 @@ static void capture_finish(struct i915_gpu_state *error)
 	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
 }
 
-static int capture(void *data)
-{
-	struct i915_gpu_state *error = data;
-
-	error->time = ktime_get_real();
-	error->boottime = ktime_get_boottime();
-	error->uptime = ktime_sub(ktime_get(),
-				  error->i915->gt.last_init_time);
-	error->capture = jiffies;
-
-	capture_params(error);
-	capture_gen_state(error);
-	capture_uc_state(error);
-	capture_reg_state(error);
-	gem_record_fences(error);
-	gem_record_rings(error);
-	capture_active_buffers(error);
-	capture_pinned_buffers(error);
-
-	error->overlay = intel_overlay_capture_error_state(error->i915);
-	error->display = intel_display_capture_error_state(error->i915);
-
-	error->epoch = capture_find_epoch(error);
-
-	capture_finish(error);
-	return 0;
-}
-
 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
 
 struct i915_gpu_state *
 i915_capture_gpu_state(struct drm_i915_private *i915)
 {
 	struct i915_gpu_state *error;
+	struct compress compress;
 
 	/* Check if GPU capture has been disabled */
 	error = READ_ONCE(i915->gpu_error.first_error);
 	if (IS_ERR(error))
 		return error;
 
-	error = kzalloc(sizeof(*error), GFP_ATOMIC);
+	error = kzalloc(sizeof(*error), GFP_KERNEL | __GFP_NOWARN);
 	if (!error) {
 		i915_disable_error_state(i915, -ENOMEM);
 		return ERR_PTR(-ENOMEM);
 	}
 
+	if (!compress_init(&compress)) {
+		kfree(error);
+		i915_disable_error_state(i915, -ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	kref_init(&error->ref);
 	error->i915 = i915;
 
-	stop_machine(capture, error, NULL);
+	error->time = ktime_get_real();
+	error->boottime = ktime_get_boottime();
+	error->uptime = ktime_sub(ktime_get(),
+				  error->i915->gt.last_init_time);
+	error->capture = jiffies;
+
+	capture_params(error);
+	capture_gen_state(error);
+	capture_uc_state(error, &compress);
+	capture_reg_state(error);
+	gem_record_fences(error);
+	gem_record_rings(error, &compress);
+
+	error->overlay = intel_overlay_capture_error_state(error->i915);
+	error->display = intel_display_capture_error_state(error->i915);
+
+	error->epoch = capture_find_epoch(error);
+
+	capture_finish(error);
+	compress_fini(&compress);
 
 	return error;
 }
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 2ecd0c6a1c94..bffa6db143e0 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -160,22 +160,6 @@ struct i915_gpu_state {
 		} vm_info;
 	} engine[I915_NUM_ENGINES];
 
-	struct drm_i915_error_buffer {
-		u32 size;
-		u32 name;
-		u64 gtt_offset;
-		u32 read_domains;
-		u32 write_domain;
-		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
-		u32 tiling:2;
-		u32 dirty:1;
-		u32 purgeable:1;
-		u32 userptr:1;
-		u32 cache_level:3;
-	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
-	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
-	struct i915_address_space *active_vm[I915_NUM_ENGINES];
-
 	struct scatterlist *sgl, *fit;
 };
 
-- 
2.22.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
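
For readers tracing the refactor: the compress_init()/compress_start()/
compress_finish()/compress_fini() split is the heart of the change. The
expensive zlib workspace is allocated exactly once, in process context where
the sleeping ALLOW_FAIL allocation is legal, and each object captured under
the engine locks afterwards only re-arms that pre-allocated state, leaving
nothing heavier than ATOMIC_MAYFAIL page allocations inside the capture path
itself. Below is a minimal userspace sketch of the same lifecycle, built on
stock zlib rather than the kernel's zlib_* API (deflateReset() stands in for
compress_start()'s re-init over the preserved workspace); the compress_ctx_*
names are illustrative, not the kernel's. Build with: cc sketch.c -lz

#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct compress_ctx {
	z_stream zstream;	/* allocated once, reused for every capture */
};

/* One-time setup: the analogue of compress_init() under ALLOW_FAIL. */
static int compress_ctx_init(struct compress_ctx *c)
{
	memset(&c->zstream, 0, sizeof(c->zstream));
	return deflateInit(&c->zstream, Z_DEFAULT_COMPRESSION) == Z_OK ? 0 : -1;
}

/* Per-capture re-arm, like compress_start(): no fresh allocations. */
static int compress_ctx_start(struct compress_ctx *c)
{
	return deflateReset(&c->zstream) == Z_OK ? 0 : -1;
}

/* Compress one "page" in a single shot, as compress_page() does per page. */
static int compress_ctx_page(struct compress_ctx *c,
			     const void *src, size_t len,
			     void *dst, size_t dst_len)
{
	c->zstream.next_in = (Bytef *)src;
	c->zstream.avail_in = (uInt)len;
	c->zstream.next_out = (Bytef *)dst;
	c->zstream.avail_out = (uInt)dst_len;
	return deflate(&c->zstream, Z_FINISH) == Z_STREAM_END ? 0 : -1;
}

/* Single teardown at the very end: the analogue of compress_fini(). */
static void compress_ctx_fini(struct compress_ctx *c)
{
	deflateEnd(&c->zstream);
}

int main(void)
{
	struct compress_ctx c;
	static char page[4096], out[8192];
	int i;

	memset(page, 'x', sizeof(page));
	if (compress_ctx_init(&c))
		return 1;

	for (i = 0; i < 3; i++) {	/* three captures, one workspace */
		if (compress_ctx_start(&c) ||
		    compress_ctx_page(&c, page, sizeof(page),
				      out, sizeof(out)))
			return 1;
		printf("capture %d: %zu -> %lu bytes\n",
		       i, sizeof(page), c.zstream.total_out);
	}

	compress_ctx_fini(&c);
	return 0;
}

The payoff, mirrored in i915_capture_gpu_state() above, is that a single
compress_init() serves every i915_error_object_create() call made during the
capture and compress_fini() runs once at the end, so no setup or teardown
cost is paid while the error state is actually being snapshotted.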