Quoting Matthew Auld (2019-05-07 11:55:57)
> The plan is to use the blitter engine for async object clearing when
> using local memory, but before we can move the worker to get_pages() we
> have to first tame some more of our struct_mutex usage. With this in
> mind we should be able to upstream the object clearing as some
> selftests, which should serve as a guinea pig for the ongoing locking
> rework and upcoming async get_pages() framework.
>
> Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx>
> ---
> +struct clear_pages_work {
> +	struct dma_fence dma;
> +	struct dma_fence_cb cb;
> +	struct i915_sw_fence wait;
> +	struct work_struct work;
> +	struct irq_work irq_work;
> +	struct i915_sleeve *sleeve;
> +	struct intel_context *ce;
> +	u32 value;
> +};
> +
> +static const char *clear_pages_work_driver_name(struct dma_fence *fence)
> +{
> +	return DRIVER_NAME;
> +}
> +
> +static const char *clear_pages_work_timeline_name(struct dma_fence *fence)
> +{
> +	return "clear";
> +}
> +
> +static void clear_pages_work_release(struct dma_fence *fence)
> +{
> +	struct clear_pages_work *w = container_of(fence, typeof(*w), dma);
> +
> +	destroy_sleeve(w->sleeve);
> +
> +	i915_sw_fence_fini(&w->wait);
> +
> +	BUILD_BUG_ON(offsetof(typeof(*w), dma));
> +	dma_fence_free(&w->dma);
> +}
> +
> +static const struct dma_fence_ops clear_pages_work_ops = {
> +	.get_driver_name = clear_pages_work_driver_name,
> +	.get_timeline_name = clear_pages_work_timeline_name,
> +	.release = clear_pages_work_release,
> +};
> +
> +static void clear_pages_signal_irq_worker(struct irq_work *work)
> +{
> +	struct clear_pages_work *w = container_of(work, typeof(*w), irq_work);
> +
> +	dma_fence_signal(&w->dma);
> +	dma_fence_put(&w->dma);
> +}
> +
> +static void clear_pages_dma_fence_cb(struct dma_fence *fence,
> +				     struct dma_fence_cb *cb)
> +{
> +	struct clear_pages_work *w = container_of(cb, typeof(*w), cb);
> +
> +	/*
> +	 * Push the signalling of the fence into yet another worker to avoid
> +	 * the nightmare locking around the fence spinlock.
> +	 */
> +	irq_work_queue(&w->irq_work);
> +}
> +
> +static void clear_pages_worker(struct work_struct *work)
> +{
> +	struct clear_pages_work *w = container_of(work, typeof(*w), work);
> +	struct drm_i915_private *i915 = w->ce->gem_context->i915;
> +	struct drm_i915_gem_object *obj = w->sleeve->obj;
> +	struct i915_vma *vma = w->sleeve->vma;
> +	struct i915_request *rq;
> +	int err;
> +
> +	if (w->dma.error)
> +		goto out_signal;
> +
> +	if (obj->cache_dirty) {
> +		obj->write_domain = 0;
> +		if (i915_gem_object_has_struct_page(obj))
> +			drm_clflush_sg(w->sleeve->pages);
> +		obj->cache_dirty = false;

Interesting. If we have no struct_page, can we be cache_dirty here? That
might be a useful thought exercise and worth verifying at odd points.

> +	}
> +	mutex_lock(&i915->drm.struct_mutex);

This will become vm->mutex. But that's why we need this patch so that we
can trim down the locking with a working test.
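On the cache_dirty question a few lines up: if the answer is that we should
never be cache_dirty without a struct_page, one cheap way to verify that at
odd points would be an assert along these lines (untested, purely
illustrative):

	/* hypothesis to check: no struct_page implies never cache_dirty */
	GEM_BUG_ON(obj->cache_dirty && !i915_gem_object_has_struct_page(obj));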
> +	err = i915_vma_pin(vma, 0, 0, PIN_USER);
> +	if (unlikely(err)) {
> +		mutex_unlock(&i915->drm.struct_mutex);
> +		dma_fence_set_error(&w->dma, err);
> +		goto out_signal;
> +	}
> +
> +	rq = i915_request_create(w->ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unpin;
> +	}
> +
> +	err = intel_emit_vma_fill_blt(rq, vma, w->value);
> +	if (unlikely(err))
> +		goto out_request;
> +
> +	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
> +out_request:
> +	if (unlikely(err))
> +		i915_request_skip(rq, err);
> +	else
> +		i915_request_get(rq);
> +
> +	i915_request_add(rq);
> +out_unpin:
> +	i915_vma_unpin(vma);
> +
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	if (!err) {
> +		err = dma_fence_add_callback(&rq->fence, &w->cb,
> +					     clear_pages_dma_fence_cb);
> +		i915_request_put(rq);
> +		if (!err)
> +			return;

This should be rearranged such that after we have a rq allocated, we always
attach the callback and propagate via the callback, even on the error path.
That should tidy this up quite a bit. (It's pretty much the point of why we
always i915_request_add even when skipping: we always have an intact
timeline for conveying errors.) A rough sketch follows a little further
down.

> +	} else {
> +		dma_fence_set_error(&w->dma, err);
> +	}
> +out_signal:
> +	dma_fence_signal(&w->dma);
> +	dma_fence_put(&w->dma);
> +}
> +
> +static int __i915_sw_fence_call
> +clear_pages_work_notify(struct i915_sw_fence *fence,
> +			enum i915_sw_fence_notify state)
> +{
> +	struct clear_pages_work *w = container_of(fence, typeof(*w), wait);
> +
> +	switch (state) {
> +	case FENCE_COMPLETE:
> +		schedule_work(&w->work);
> +		break;
> +
> +	case FENCE_FREE:
> +		dma_fence_put(&w->dma);
> +		break;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static DEFINE_SPINLOCK(fence_lock);
> +
> +int i915_gem_schedule_fill_pages_blt(struct drm_i915_gem_object *obj,

Not sold on this name. Scheduling is inherent in the name GEM, and this
takes the i915_gem_object as its primary argument. I'd favour
i915_gem_object_fill_blt, though it's really part of the mman family. To be
resolved later.

> +				     struct intel_context *ce,
> +				     struct sg_table *pages,
> +				     struct i915_page_sizes *page_sizes,
> +				     u32 value)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct i915_gem_context *ctx = ce->gem_context;
> +	struct i915_address_space *vm;
> +	struct clear_pages_work *work;
> +	struct i915_sleeve *sleeve;
> +	int err;
> +
> +	vm = ctx->ppgtt ? &ctx->ppgtt->vm : &i915->ggtt.vm;

Remind me, this needs to be ce->vm.
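To expand on the rearrangement suggested for clear_pages_worker() above,
here is an untested sketch (using only the names already in the patch) of
the tail once we hold a request:

	rq = i915_request_create(w->ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unpin;
	}

	/* From here on, any failure is conveyed by skipping the request. */
	err = intel_emit_vma_fill_blt(rq, vma, w->value);
	if (!err)
		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
	if (unlikely(err))
		i915_request_skip(rq, err);

	/* Always attach the callback; the request timeline carries the error. */
	i915_request_get(rq);
	i915_request_add(rq);
	if (dma_fence_add_callback(&rq->fence, &w->cb, clear_pages_dma_fence_cb))
		irq_work_queue(&w->irq_work); /* rq already completed */
	i915_request_put(rq);

	i915_vma_unpin(vma);
	mutex_unlock(&i915->drm.struct_mutex);
	return;

clear_pages_dma_fence_cb would then also want to copy any rq->fence error
into w->dma (dma_fence_set_error) before queuing the irq work, so the error
still reaches the exported fence.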
> +	sleeve = create_sleeve(vm, obj, pages, page_sizes);
> +	if (IS_ERR(sleeve))
> +		return PTR_ERR(sleeve);
> +
> +	work = kmalloc(sizeof(*work), GFP_KERNEL);
> +	if (work == NULL) {
> +		destroy_sleeve(sleeve);
> +		return -ENOMEM;
> +	}
> +
> +	work->value = value;
> +	work->sleeve = sleeve;
> +	work->ce = ce;
> +
> +	INIT_WORK(&work->work, clear_pages_worker);
> +
> +	init_irq_work(&work->irq_work, clear_pages_signal_irq_worker);
> +
> +	dma_fence_init(&work->dma,
> +		       &clear_pages_work_ops,
> +		       &fence_lock,
> +		       i915->mm.unordered_timeline,
> +		       0);
> +	i915_sw_fence_init(&work->wait, clear_pages_work_notify);
> +
> +	i915_gem_object_lock(obj);
> +	err = i915_sw_fence_await_reservation(&work->wait,
> +					      obj->resv, NULL,
> +					      true, I915_FENCE_TIMEOUT,
> +					      I915_FENCE_GFP);
> +	if (err < 0) {
> +		dma_fence_set_error(&work->dma, err);
> +	} else {
> +		reservation_object_add_excl_fence(obj->resv, &work->dma);
> +		err = 0;
> +	}
> +	i915_gem_object_unlock(obj);
> +
> +	dma_fence_get(&work->dma);
> +	i915_sw_fence_commit(&work->wait);
> +
> +	return err;
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_client_blt.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_client_blt.h b/drivers/gpu/drm/i915/i915_gem_client_blt.h
> new file mode 100644
> index 000000000000..a7080623e741
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_client_blt.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +#ifndef __I915_GEM_CLIENT_BLT_H__
> +#define __I915_GEM_CLIENT_BLT_H__
> +
> +#include <linux/types.h>
> +
> +struct drm_i915_gem_object;
> +struct intel_context ce;
> +struct i915_page_sizes;
> +struct sg_table;
> +
> +int i915_gem_schedule_fill_pages_blt(struct drm_i915_gem_object *obj,
> +				     struct intel_context *ce,
> +				     struct sg_table *pages,
> +				     struct i915_page_sizes *page_sizes,
> +				     u32 value);
> +
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_object_blt.c b/drivers/gpu/drm/i915/i915_gem_object_blt.c
> new file mode 100644
> index 000000000000..3fda33e5dcf5
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_object_blt.c
> @@ -0,0 +1,103 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_gem_object_blt.h"
> +
> +#include "i915_gem_clflush.h"
> +#include "intel_drv.h"
> +
> +int intel_emit_vma_fill_blt(struct i915_request *rq,
> +			    struct i915_vma *vma,
> +			    u32 value)
> +{
> +	struct intel_context *ce = rq->hw_context;
> +	u32 *cs;
> +	int err;
> +
> +	if (ce->engine->emit_init_breadcrumb) {
> +		err = ce->engine->emit_init_breadcrumb(rq);
> +		if (unlikely(err))
> +			return err;
> +	}

Though it may push some duplication into the callers, this is the caller's
duty.
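For illustration only, if the breadcrumb does move out of the helper, the
caller side (e.g. i915_gem_object_fill_blt or the clear-pages worker) would
gain something like this, untested, between creating the request and
emitting the blt:

	/* caller's duty: emit the initial breadcrumb for its own context */
	if (ce->engine->emit_init_breadcrumb) {
		err = ce->engine->emit_init_breadcrumb(rq);
		if (unlikely(err))
			goto out_request;
	}

	err = intel_emit_vma_fill_blt(rq, vma, value);
	if (unlikely(err))
		goto out_request;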
> +	cs = intel_ring_begin(rq, 8);

Missing the error check here:

	if (IS_ERR(cs))
		return PTR_ERR(cs);

> +
> +	if (INTEL_GEN(rq->i915) >= 8) {
> +		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7-2);
> +		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
> +		*cs++ = 0;
> +		*cs++ = vma->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +		*cs++ = lower_32_bits(vma->node.start);
> +		*cs++ = upper_32_bits(vma->node.start);
> +		*cs++ = value;
> +		*cs++ = MI_NOOP;
> +	} else {
> +		*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6-2);
> +		*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
> +		*cs++ = 0;
> +		*cs++ = vma->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +		*cs++ = vma->node.start;
> +		*cs++ = value;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
> +			     struct intel_context *ce,
> +			     u32 value)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +	struct i915_gem_context *ctx = ce->gem_context;
> +	struct i915_address_space *vm;
> +	struct i915_request *rq;
> +	struct i915_vma *vma;
> +	int err;
> +
> +	vm = ctx->ppgtt ? &ctx->ppgtt->vm : &i915->ggtt.vm;
> +
> +	vma = i915_vma_instance(obj, vm, NULL);
> +	if (IS_ERR(vma))
> +		return PTR_ERR(vma);
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_USER);
> +	if (unlikely(err))
> +		return err;
> +
> +	if (obj->cache_dirty)

if (obj->cache_dirty & ~obj->cache_coherent)

> +		i915_gem_clflush_object(obj, 0);
> +
> +	rq = i915_request_create(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unpin;
> +	}
> +
> +	err = i915_request_await_object(rq, obj, true);
> +	if (unlikely(err))
> +		goto out_request;
> +
> +	err = intel_emit_vma_fill_blt(rq, vma, value);
> +	if (unlikely(err))
> +		goto out_request;
> +
> +	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
> +out_request:
> +	if (unlikely(err))
> +		i915_request_skip(rq, err);
> +
> +	i915_request_add(rq);
> +out_unpin:
> +	i915_vma_unpin(vma);
> +	return err;

Ok.
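As a usage note (not part of this patch): i915_gem_object_fill_blt() only
queues the request, so a caller that needs the result visible to the CPU
still has to wait, roughly:

	/* hypothetical caller: fill with zeroes, then wait for the blt */
	err = i915_gem_object_fill_blt(obj, ce, 0);
	if (err)
		return err;

	err = i915_gem_object_wait(obj,
				   I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL,
				   MAX_SCHEDULE_TIMEOUT);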
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_object_blt.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_object_blt.h b/drivers/gpu/drm/i915/i915_gem_object_blt.h
> new file mode 100644
> index 000000000000..7ec7de6ac0c0
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_object_blt.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef __I915_GEM_OBJECT_BLT_H__
> +#define __I915_GEM_OBJECT_BLT_H__
> +
> +#include <linux/types.h>
> +
> +struct drm_i915_gem_object;
> +struct intel_context;
> +struct i915_request;
> +struct i915_vma;
> +
> +int intel_emit_vma_fill_blt(struct i915_request *rq,
> +			    struct i915_vma *vma,
> +			    u32 value);
> +
> +int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
> +			     struct intel_context *ce,
> +			     u32 value);
> +
> +#endif
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c b/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c
> new file mode 100644
> index 000000000000..54b15d22e310
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c
> @@ -0,0 +1,131 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "../i915_selftest.h"
> +
> +#include "igt_flush_test.h"
> +#include "mock_drm.h"
> +#include "mock_context.h"
> +
> +static int igt_client_fill(void *arg)
> +{
> +	struct intel_context *ce = arg;
> +	struct drm_i915_private *i915 = ce->gem_context->i915;
> +	struct drm_i915_gem_object *obj;
> +	struct rnd_state prng;
> +	IGT_TIMEOUT(end);
> +	u32 *vaddr;
> +	int err = 0;
> +
> +	prandom_seed_state(&prng, i915_selftest.random_seed);
> +
> +	do {
> +		u32 sz = prandom_u32_state(&prng) % SZ_32M;
> +		u32 val = prandom_u32_state(&prng);
> +		u32 i;
> +
> +		sz = round_up(sz, PAGE_SIZE);
> +
> +		pr_info("%s with sz=%x, val=%x\n", __func__, sz, val);

pr_debug? Won't this be quite frequent?

> +		obj = i915_gem_object_create_internal(i915, sz);
> +		if (IS_ERR(obj)) {
> +			err = PTR_ERR(obj);
> +			goto err_flush;
> +		}
> +
> +		vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
> +		if (IS_ERR(vaddr)) {
> +			err = PTR_ERR(vaddr);
> +			goto err_put;
> +		}
> +
> +		/*
> +		 * XXX: The goal is move this to get_pages, so try to dirty the
> +		 * CPU cache first to check that we do the required clflush
> +		 * before scheduling the blt for !llc platforms. This matches
> +		 * some version of reality where at get_pages the pages
> +		 * themselves may not yet be coherent with the GPU(swap-in). If
> +		 * we are missing the flush then we should see the stale cache
> +		 * values after we do the set_to_cpu_domain and pick it up as a
> +		 * test failure.
> +		 */
> +		memset32(vaddr, val ^ 0xdeadbeaf, obj->base.size / sizeof(u32));
> +
> +		if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
> +			obj->cache_dirty = true;
> +
> +		err = i915_gem_schedule_fill_pages_blt(obj, ce, obj->mm.pages,
> +						       &obj->mm.page_sizes,
> +						       val);
> +		if (err)
> +			goto err_unpin;
> +
> +		/*
> +		 * XXX: For now do the wait without the BKL to ensure we don't
> +		 * deadlock.
> +		 */
> +		err = i915_gem_object_wait(obj,
> +					   I915_WAIT_INTERRUPTIBLE |
> +					   I915_WAIT_ALL,
> +					   MAX_SCHEDULE_TIMEOUT);
> +		if (err)
> +			goto err_unpin;
> +
> +		mutex_lock(&i915->drm.struct_mutex);
> +		err = i915_gem_object_set_to_cpu_domain(obj, false);
> +		mutex_unlock(&i915->drm.struct_mutex);
> +		if (err)
> +			goto err_unpin;
> +
> +		for (i = 0; i < obj->base.size / sizeof(u32); ++i) {
> +			if (vaddr[i] != val) {
> +				pr_err("vaddr[%u]=%x, expected=%x\n", i,
> +				       vaddr[i], val);
> +				err = -EINVAL;
> +				goto err_unpin;
> +			}
> +		}
> +
> +		i915_gem_object_unpin_map(obj);
> +
> +		mutex_lock(&i915->drm.struct_mutex);
> +		__i915_gem_object_release_unless_active(obj);
> +		mutex_unlock(&i915->drm.struct_mutex);
> +	} while (!time_after(jiffies, end));
> +
> +	goto err_flush;
> +
> +err_unpin:
> +	i915_gem_object_unpin_map(obj);
> +err_put:
> +	mutex_lock(&i915->drm.struct_mutex);
> +	__i915_gem_object_release_unless_active(obj);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +err_flush:
> +	mutex_lock(&i915->drm.struct_mutex);
> +	igt_flush_test(i915, I915_WAIT_LOCKED);

if (igt...)
	err = -EIO;

When it fails, we are wedged, so promote the result to an -EIO.

> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	if (err == -ENOMEM)
> +		err = 0;
> +
> +	return err;
> +}

So much work to do to reduce lock coverage so that we can emit requests
from inside obj->mm.lock. This code is very much a WIP and not near ready
for actual use, but serves a very, very useful purpose in providing a test
bed for incremental improvements.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx