This was the competing idea long ago, but it was only with the rewrite of the idr as an radixtree and using the radixtree directly ourselves, and the realisation that we can store the vma directly in the radixtree and only need a list for the reverse mapping, that made the patch performant enough to displace using a hashtable. Though the vma ht is fast and doesn't require and extra allocation (as we can embed the node inside the vma), it does require a thread for resizing and serialization. That is hairy enough to investigate alternative and favour them if equivalent in performance. One advantage of allocating an indirection entry is that we can support a single shared bo between many clients, something that was done on a first-come first-serve basis for shared GGTT vma previously. To offset the extra allocations, we create yet another kmem_cache for them. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 6 -- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_gem.c | 32 ++++++++-- drivers/gpu/drm/i915/i915_gem_context.c | 86 +++++---------------------- drivers/gpu/drm/i915/i915_gem_context.h | 30 ++-------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 70 +++++++--------------- drivers/gpu/drm/i915/i915_gem_object.h | 10 +++- drivers/gpu/drm/i915/i915_vma.c | 22 ------- drivers/gpu/drm/i915/i915_vma.h | 4 -- drivers/gpu/drm/i915/selftests/mock_context.c | 15 ++--- lib/radix-tree.c | 1 + 11 files changed, 81 insertions(+), 196 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 4577b0af6886..a6ba2100bb88 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1998,12 +1998,6 @@ static int i915_context_status(struct seq_file *m, void *unused) seq_putc(m, '\n'); } - seq_printf(m, - "\tvma hashtable size=%u (actual %lu), count=%u\n", - ctx->vma_lut.ht_size, - BIT(ctx->vma_lut.ht_bits), - ctx->vma_lut.ht_count); - seq_putc(m, '\n'); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index cd15697c1ca4..de8dd0c06ce6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2083,6 +2083,7 @@ struct drm_i915_private { struct kmem_cache *objects; struct kmem_cache *vmas; + struct kmem_cache *luts; struct kmem_cache *requests; struct kmem_cache *dependencies; struct kmem_cache *priorities; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 7dcac3bfb771..3388c65b54a3 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3252,19 +3252,33 @@ i915_gem_idle_work_handler(struct work_struct *work) void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) { + struct drm_i915_private *i915 = to_i915(gem->dev); struct drm_i915_gem_object *obj = to_intel_bo(gem); struct drm_i915_file_private *fpriv = file->driver_priv; struct i915_vma *vma, *vn; + struct i915_lut_handle *lut, *ln; mutex_lock(&obj->base.dev->struct_mutex); + + list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) { + struct i915_gem_context *ctx = lut->ctx; + + if (ctx->file_priv != fpriv) + continue; + + radix_tree_delete(&ctx->handles_vma, lut->handle); + i915_gem_object_put(obj); + + list_del(&lut->obj_link); + list_del(&lut->ctx_link); + + kmem_cache_free(i915->luts, lut); + } + list_for_each_entry_safe(vma, vn, &obj->vma_list, obj_link) if (vma->vm->file == fpriv) i915_vma_close(vma); - vma = obj->vma_hashed; - if (vma && vma->ctx->file_priv == fpriv) - i915_vma_unlink_ctx(vma); - if (i915_gem_object_is_active(obj) && !i915_gem_object_has_active_reference(obj)) { i915_gem_object_set_active_reference(obj); @@ -4259,6 +4273,7 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(&obj->global_link); INIT_LIST_HEAD(&obj->userfault_link); INIT_LIST_HEAD(&obj->vma_list); + INIT_LIST_HEAD(&obj->lut_list); INIT_LIST_HEAD(&obj->batch_pool_link); obj->ops = ops; @@ -4897,12 +4912,16 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) if (!dev_priv->vmas) goto err_objects; + dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0); + if (!dev_priv->luts) + goto err_vmas; + dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_TYPESAFE_BY_RCU); if (!dev_priv->requests) - goto err_vmas; + goto err_luts; dev_priv->dependencies = KMEM_CACHE(i915_dependency, SLAB_HWCACHE_ALIGN | @@ -4949,6 +4968,8 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) kmem_cache_destroy(dev_priv->dependencies); err_requests: kmem_cache_destroy(dev_priv->requests); +err_luts: + kmem_cache_destroy(dev_priv->luts); err_vmas: kmem_cache_destroy(dev_priv->vmas); err_objects: @@ -4971,6 +4992,7 @@ void i915_gem_load_cleanup(struct drm_i915_private *dev_priv) kmem_cache_destroy(dev_priv->priorities); kmem_cache_destroy(dev_priv->dependencies); kmem_cache_destroy(dev_priv->requests); + kmem_cache_destroy(dev_priv->luts); kmem_cache_destroy(dev_priv->vmas); kmem_cache_destroy(dev_priv->objects); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 39ed58a21fc1..dc182a8dd4e8 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -93,69 +93,21 @@ #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1 -/* Initial size (as log2) to preallocate the handle->object hashtable */ -#define VMA_HT_BITS 2u /* 4 x 2 pointers, 64 bytes minimum */ - -static void resize_vma_ht(struct work_struct *work) +static void lut_close(struct i915_gem_context *ctx) { - struct i915_gem_context_vma_lut *lut = - container_of(work, typeof(*lut), resize); - unsigned int bits, new_bits, size, i; - struct hlist_head *new_ht; - - GEM_BUG_ON(!(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS)); - - bits = 1 + ilog2(4*lut->ht_count/3 + 1); - new_bits = min_t(unsigned int, - max(bits, VMA_HT_BITS), - sizeof(unsigned int) * BITS_PER_BYTE - 1); - if (new_bits == lut->ht_bits) - goto out; - - new_ht = kzalloc(sizeof(*new_ht)<<new_bits, GFP_KERNEL | __GFP_NOWARN); - if (!new_ht) - new_ht = vzalloc(sizeof(*new_ht)<<new_bits); - if (!new_ht) - /* Pretend resize succeeded and stop calling us for a bit! */ - goto out; + struct i915_lut_handle *lut, *ln; + struct radix_tree_iter iter; + void __rcu **slot; - size = BIT(lut->ht_bits); - for (i = 0; i < size; i++) { - struct i915_vma *vma; - struct hlist_node *tmp; - - hlist_for_each_entry_safe(vma, tmp, &lut->ht[i], ctx_node) - hlist_add_head(&vma->ctx_node, - &new_ht[hash_32(vma->ctx_handle, - new_bits)]); + list_for_each_entry_safe(lut, ln, &ctx->handles_list, ctx_link) { + list_del(&lut->obj_link); + kmem_cache_free(ctx->i915->luts, lut); } - kvfree(lut->ht); - lut->ht = new_ht; - lut->ht_bits = new_bits; -out: - smp_store_release(&lut->ht_size, BIT(bits)); - GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS); -} -static void vma_lut_free(struct i915_gem_context *ctx) -{ - struct i915_gem_context_vma_lut *lut = &ctx->vma_lut; - unsigned int i, size; - - if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) - cancel_work_sync(&lut->resize); - - size = BIT(lut->ht_bits); - for (i = 0; i < size; i++) { - struct i915_vma *vma; - - hlist_for_each_entry(vma, &lut->ht[i], ctx_node) { - vma->obj->vma_hashed = NULL; - vma->ctx = NULL; - i915_vma_put(vma); - } + radix_tree_for_each_slot(slot, &ctx->handles_vma, &iter, 0) { + i915_vma_put(rcu_dereference_raw(*slot)); + radix_tree_iter_delete(&ctx->handles_vma, &iter, slot); } - kvfree(lut->ht); } void i915_gem_context_free(struct kref *ctx_ref) @@ -167,7 +119,6 @@ void i915_gem_context_free(struct kref *ctx_ref) trace_i915_context_free(ctx); GEM_BUG_ON(!i915_gem_context_is_closed(ctx)); - vma_lut_free(ctx); i915_ppgtt_put(ctx->ppgtt); for (i = 0; i < I915_NUM_ENGINES; i++) { @@ -195,8 +146,11 @@ void i915_gem_context_free(struct kref *ctx_ref) static void context_close(struct i915_gem_context *ctx) { i915_gem_context_set_closed(ctx); + + lut_close(ctx); if (ctx->ppgtt) i915_ppgtt_close(&ctx->ppgtt->base); + ctx->file_priv = ERR_PTR(-EBADF); i915_gem_context_put(ctx); } @@ -269,16 +223,8 @@ __create_hw_context(struct drm_i915_private *dev_priv, ctx->i915 = dev_priv; ctx->priority = I915_PRIORITY_NORMAL; - ctx->vma_lut.ht_bits = VMA_HT_BITS; - ctx->vma_lut.ht_size = BIT(VMA_HT_BITS); - BUILD_BUG_ON(BIT(VMA_HT_BITS) == I915_CTX_RESIZE_IN_PROGRESS); - ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size, - sizeof(*ctx->vma_lut.ht), - GFP_KERNEL); - if (!ctx->vma_lut.ht) - goto err_out; - - INIT_WORK(&ctx->vma_lut.resize, resize_vma_ht); + INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL); + INIT_LIST_HEAD(&ctx->handles_list); /* Default context will never have a file_priv */ ret = DEFAULT_CONTEXT_HANDLE; @@ -328,8 +274,6 @@ __create_hw_context(struct drm_i915_private *dev_priv, put_pid(ctx->pid); idr_remove(&file_priv->context_idr, ctx->user_handle); err_lut: - kvfree(ctx->vma_lut.ht); -err_out: context_close(ctx); return ERR_PTR(ret); } diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 82c99ba92ad3..cd0c5d8e0db7 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -27,6 +27,7 @@ #include <linux/bitops.h> #include <linux/list.h> +#include <linux/radix-tree.h> struct pid; @@ -143,32 +144,6 @@ struct i915_gem_context { /** ggtt_offset_bias: placement restriction for context objects */ u32 ggtt_offset_bias; - struct i915_gem_context_vma_lut { - /** ht_size: last request size to allocate the hashtable for. */ - unsigned int ht_size; -#define I915_CTX_RESIZE_IN_PROGRESS BIT(0) - /** ht_bits: real log2(size) of hashtable. */ - unsigned int ht_bits; - /** ht_count: current number of entries inside the hashtable */ - unsigned int ht_count; - - /** ht: the array of buckets comprising the simple hashtable */ - struct hlist_head *ht; - - /** - * resize: After an execbuf completes, we check the load factor - * of the hashtable. If the hashtable is too full, or too empty, - * we schedule a task to resize the hashtable. During the - * resize, the entries are moved between different buckets and - * so we cannot simultaneously read the hashtable as it is - * being resized (unlike rhashtable). Therefore we treat the - * active work as a strong barrier, pausing a subsequent - * execbuf to wait for the resize worker to complete, if - * required. - */ - struct work_struct resize; - } vma_lut; - /** engine: per-engine logical HW state */ struct intel_context { struct i915_vma *state; @@ -199,6 +174,9 @@ struct i915_gem_context { /** remap_slice: Bitmask of cache lines that need remapping */ u8 remap_slice; + + struct radix_tree_root handles_vma; + struct list_head handles_list; }; static inline bool i915_gem_context_is_closed(const struct i915_gem_context *ctx) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 18d6581bc6c5..adcea065985c 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -637,19 +637,6 @@ static int eb_reserve(struct i915_execbuffer *eb) } while (1); } -static inline struct hlist_head * -ht_head(const struct i915_gem_context_vma_lut *lut, u32 handle) -{ - return &lut->ht[hash_32(handle, lut->ht_bits)]; -} - -static inline bool -ht_needs_resize(const struct i915_gem_context_vma_lut *lut) -{ - return (4*lut->ht_count > 3*lut->ht_size || - 4*lut->ht_count + 1 < lut->ht_size); -} - static unsigned int eb_batch_index(const struct i915_execbuffer *eb) { if (eb->args->flags & I915_EXEC_BATCH_FIRST) @@ -684,31 +671,22 @@ static int eb_select_context(struct i915_execbuffer *eb) static int eb_lookup_vmas(struct i915_execbuffer *eb, unsigned int flags) { - struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut; - struct drm_i915_gem_object *uninitialized_var(obj); + struct drm_i915_gem_object *obj; + struct radix_tree_root *handles_vma = &eb->ctx->handles_vma; unsigned int i; int err; INIT_LIST_HEAD(&eb->relocs); INIT_LIST_HEAD(&eb->unbound); - if (unlikely(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS)) - flush_work(&lut->resize); - GEM_BUG_ON(lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS); - for (i = 0; i < eb->buffer_count; i++) { u32 handle = eb->exec[i].handle; - struct hlist_head *hl = ht_head(lut, handle); struct i915_vma *vma; + struct i915_lut_handle *lut; - hlist_for_each_entry(vma, hl, ctx_node) { - GEM_BUG_ON(vma->ctx != eb->ctx); - - if (vma->ctx_handle != handle) - continue; - + vma = radix_tree_lookup(handles_vma, handle); + if (likely(vma)) goto add_vma; - } obj = i915_gem_object_lookup(eb->file, handle); if (unlikely(!obj)) { @@ -716,43 +694,36 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb, unsigned int flags) goto err_vma; } - flags |= __EXEC_OBJECT_HAS_REF; - vma = i915_vma_instance(obj, eb->vm, NULL); if (unlikely(IS_ERR(vma))) { err = PTR_ERR(vma); goto err_obj; } - /* First come, first served */ - if (!vma->ctx) { - vma->ctx = eb->ctx; - vma->ctx_handle = handle; - hlist_add_head(&vma->ctx_node, hl); - lut->ht_count++; - lut->ht_size |= I915_CTX_RESIZE_IN_PROGRESS; - if (i915_vma_is_ggtt(vma)) { - GEM_BUG_ON(obj->vma_hashed); - obj->vma_hashed = vma; - } + lut = kmem_cache_alloc(eb->i915->luts, GFP_KERNEL); + if (unlikely(!lut)) { + err = -ENOMEM; + goto err_obj; + } - /* transfer ref to ctx */ - flags &= ~__EXEC_OBJECT_HAS_REF; + /* transfer ref to ctx */ + err = radix_tree_insert(handles_vma, handle, vma); + if (unlikely(err)) { + kfree(lut); + goto err_obj; } + list_add(&lut->obj_link, &obj->lut_list); + list_add(&lut->ctx_link, &eb->ctx->handles_list); + lut->ctx = eb->ctx; + lut->handle = handle; + add_vma: err = eb_add_vma(eb, i, vma, flags); if (unlikely(err)) goto err_vma; } - if (lut->ht_size & I915_CTX_RESIZE_IN_PROGRESS) { - if (ht_needs_resize(lut)) - queue_work(system_highpri_wq, &lut->resize); - else - lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS; - } - /* take note of the batch buffer before we might reorder the lists */ i = eb_batch_index(eb); eb->batch = eb->vma[i]; @@ -778,7 +749,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb, unsigned int flags) i915_gem_object_put(obj); err_vma: eb->vma[i] = NULL; - lut->ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS; return err; } diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index 5b19a4916a4d..8188d3cf5e11 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -35,6 +35,13 @@ #include "i915_selftest.h" +struct i915_lut_handle { + struct list_head obj_link; + struct list_head ctx_link; + struct i915_gem_context *ctx; + u32 handle; +}; + struct drm_i915_gem_object_ops { unsigned int flags; #define I915_GEM_OBJECT_HAS_STRUCT_PAGE BIT(0) @@ -86,7 +93,8 @@ struct drm_i915_gem_object { * They are also added to @vma_list for easy iteration. */ struct rb_root vma_tree; - struct i915_vma *vma_hashed; + + struct list_head lut_list; /** Stolen memory for this object, instead of being backed by shmem. */ struct drm_mm_node *stolen; diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 532c709febbd..93329a809764 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -591,33 +591,11 @@ static void i915_vma_destroy(struct i915_vma *vma) kmem_cache_free(to_i915(vma->obj->base.dev)->vmas, vma); } -void i915_vma_unlink_ctx(struct i915_vma *vma) -{ - struct i915_gem_context *ctx = vma->ctx; - - if (ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) { - cancel_work_sync(&ctx->vma_lut.resize); - ctx->vma_lut.ht_size &= ~I915_CTX_RESIZE_IN_PROGRESS; - } - - __hlist_del(&vma->ctx_node); - ctx->vma_lut.ht_count--; - - if (i915_vma_is_ggtt(vma)) - vma->obj->vma_hashed = NULL; - vma->ctx = NULL; - - i915_vma_put(vma); -} - void i915_vma_close(struct i915_vma *vma) { GEM_BUG_ON(i915_vma_is_closed(vma)); vma->flags |= I915_VMA_CLOSED; - if (vma->ctx) - i915_vma_unlink_ctx(vma); - list_del(&vma->obj_link); rb_erase(&vma->obj_node, &vma->obj->vma_tree); diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index 5f1356e93c6c..6d35cf7ddcfb 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -115,10 +115,6 @@ struct i915_vma { unsigned int *exec_flags; struct hlist_node exec_node; u32 exec_handle; - - struct i915_gem_context *ctx; - struct hlist_node ctx_node; - u32 ctx_handle; }; struct i915_vma * diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c index f8b9cc212b02..3ee31ef84607 100644 --- a/drivers/gpu/drm/i915/selftests/mock_context.c +++ b/drivers/gpu/drm/i915/selftests/mock_context.c @@ -40,18 +40,13 @@ mock_context(struct drm_i915_private *i915, INIT_LIST_HEAD(&ctx->link); ctx->i915 = i915; - ctx->vma_lut.ht_bits = VMA_HT_BITS; - ctx->vma_lut.ht_size = BIT(VMA_HT_BITS); - ctx->vma_lut.ht = kcalloc(ctx->vma_lut.ht_size, - sizeof(*ctx->vma_lut.ht), - GFP_KERNEL); - if (!ctx->vma_lut.ht) - goto err_free; + INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL); + INIT_LIST_HEAD(&ctx->handles_list); ret = ida_simple_get(&i915->context_hw_ida, 0, MAX_CONTEXT_HW_ID, GFP_KERNEL); if (ret < 0) - goto err_vma_ht; + goto err_handles; ctx->hw_id = ret; if (name) { @@ -66,9 +61,7 @@ mock_context(struct drm_i915_private *i915, return ctx; -err_vma_ht: - kvfree(ctx->vma_lut.ht); -err_free: +err_handles: kfree(ctx); return NULL; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 898e87998417..3527eb364964 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2022,6 +2022,7 @@ void radix_tree_iter_delete(struct radix_tree_root *root, if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } +EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree -- 2.11.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx