On ke, 2017-03-29 at 16:56 +0100, Chris Wilson wrote:
> The major scaling bottleneck in execbuffer is the processing of the
> execobjects. Creating an auxiliary list is inefficient when compared to
> using the execobject array we already have allocated.
>
> Reservation is then split into phases. As we look up the VMA, we try to
> bind it back into its active location. Only if that fails do we add it
> to the unbound list for phase 2. In phase 2, we try to add all those
> objects that could not fit into their previous location, with fallback
> to retrying all objects and evicting the VM in case of severe
> fragmentation. (This is the same as before, except that phase 1 is now
> done inline with looking up the VMA to avoid an iteration over the
> execobject array. In the ideal case, we eliminate the separate
> reservation phase.) During the reservation phase, we only evict from the
> VM between passes (rather than, as currently, every time we try to fit a
> new VMA). In testing with Unreal Engine's Atlantis demo, which stresses
> the eviction logic on gen7 class hardware, this sped up the framerate by
> a factor of 2.
>
> The second loop amalgamation is between move_to_gpu and move_to_active.
> As we always submit the request, even if incomplete, we can use the
> current request to track active VMA as we perform the flushes and
> synchronisation required.
>
> The next big advancement is to avoid copying back to the user any
> execobjects and relocations that are not changed.
>
> v2: Add a Theory of Operation spiel.
> v3: Fall back to slow relocations in preparation for flushing userptrs.
>
> Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>

<SNIP>

> struct i915_execbuffer {
>         struct drm_i915_private *i915;
>         struct drm_file *file;
> @@ -63,19 +180,24 @@ struct i915_execbuffer {
>         struct i915_address_space *vm;
>         struct i915_vma *batch;
>         struct drm_i915_gem_request *request;
> -       u32 batch_start_offset;
> -       u32 batch_len;
> -       unsigned int dispatch_flags;
> -       struct drm_i915_gem_exec_object2 shadow_exec_entry;
> -       bool need_relocs;
> -       struct list_head vmas;
> +       unsigned int buffer_count;
> +       struct list_head unbound;
> +       struct list_head relocs;
>         struct reloc_cache {
>                 struct drm_mm_node node;
>                 unsigned long vaddr;
>                 unsigned int page;
>                 bool use_64bit_reloc : 1;
> +               bool has_llc : 1;
> +               bool has_fence : 1;
> +               bool needs_unfenced : 1;
>         } reloc_cache;
> -       int lut_mask;
> +       u64 invalid_flags;
> +       u32 context_flags;
> +       u32 dispatch_flags;
> +       u32 batch_start_offset;
> +       u32 batch_len;
> +       int lut_size;
>         struct hlist_head *buckets;

Please document the (new) members.

> +static inline u64 gen8_noncanonical_addr(u64 address)
> +{
> +       return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);

GENMASK_ULL

> @@ -106,71 +256,302 @@ eb_create(struct i915_execbuffer *eb)
>                         return -ENOMEM;
>                 }
>
> -               eb->lut_mask = size;
> +               eb->lut_size = size;
>         } else {
> -               eb->lut_mask = -eb->args->buffer_count;
> +               eb->lut_size = -eb->buffer_count;

Document the negative meaning in the doc mentioned above.

> +static bool
> +eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
> +                const struct i915_vma *vma)
> +{
> +       if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)

Let's try to stick to one convention; we gave up "== NULL" too, so
!(a & FOO).

<SNIP>

> +       if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
> +           (vma->node.start + vma->node.size - 1) >> 32)

upper_32_bits for clarity?
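To be concrete, here is a rough, untested sketch of the two helper
suggestions above; both macros already exist in the kernel, and I've
folded in the !(a & FOO) convention as well:

static inline u64 gen8_noncanonical_addr(u64 address)
{
        /* GENMASK_ULL(h, l) sets bits h..0 inclusive here, i.e. the
         * same value as the open-coded (1ULL << (bit + 1)) - 1.
         */
        return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0);
}

and in eb_vma_misplaced(), assuming the snipped branch returns true
like its siblings:

        if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
            upper_32_bits(vma->node.start + vma->node.size - 1))
                return true;

upper_32_bits() is just (u32)(n >> 32), so there is no change in
behaviour, only in readability.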
> +static void
> +eb_pin_vma(struct i915_execbuffer *eb,
> +          struct drm_i915_gem_exec_object2 *entry,
> +          struct i915_vma *vma)
> +{
> +       u64 flags;
> +
> +       flags = vma->node.start;

I'd be more comfortable if some mask were applied here. Or at least
GEM_BUG_ON(flags & BAD_FLAGS);

> +static inline void
> +eb_unreserve_vma(struct i915_vma *vma,
> +                struct drm_i915_gem_exec_object2 *entry)
> {
> -       struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> -       __eb_unreserve_vma(vma, entry);
> -       entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
> +       if (entry->flags & __EXEC_OBJECT_HAS_PIN) {

I'd treat the if more as an early return, but I guess you have more
code coming.

> +static int
> +eb_add_vma(struct i915_execbuffer *eb,
> +          struct drm_i915_gem_exec_object2 *entry,
> +          struct i915_vma *vma)
> {
> -       struct i915_vma *vma;
> +       int ret;
>
> -       list_for_each_entry(vma, &eb->vmas, exec_link) {
> -               eb_unreserve_vma(vma);
> -               i915_vma_put(vma);
> -               vma->exec_entry = NULL;
> +       GEM_BUG_ON(i915_vma_is_closed(vma));
> +
> +       if ((eb->args->flags & __EXEC_VALIDATED) == 0) {

Smells like a function here.

<SNIP>

>         }
>
> -       if (eb->lut_mask >= 0)
> -               memset(eb->buckets, 0,
> -                      sizeof(struct hlist_head) << eb->lut_mask);
> +       vma->exec_entry = entry;
> +       entry->rsvd2 = (uintptr_t)vma;

Umm, there was a helper introduced in some patch.

<SNIP>

Could add a comment as to why we do this?

> +       if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
> +               entry->flags |= eb->context_flags;
> +

<SNIP>

> +
> +static int
> +eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> +{

<SNIP>

> -       i915_vma_get(vma);
> -       __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
> -       return true;
> +       if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
> +               ret = i915_vma_get_fence(vma);
> +               if (ret)
> +                       return ret;
> +
> +               if (i915_vma_pin_fence(vma))
> +                       entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> +       }

Smells like duplicate code with eb_pin_vma.

<SNIP>

> static int
> eb_lookup_vmas(struct i915_execbuffer *eb)
> {

<SNIP>

> -       if (ht_needs_resize(eb->ctx)) {
> -               eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
> -               queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
> -       }
> +       if (eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
> +               struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;

You could hoist the lut variable, it's used quite a few times.

> @@ -616,16 +1048,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
>                 goto repeat;
>         }
>
> -       return 0;
> +       return gen8_canonical_addr(target->node.start) | 1;

Magic bit.

> +static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> {
> #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
> -       struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
> -       struct drm_i915_gem_relocation_entry __user *user_relocs;
> -       struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -       int remain, ret = 0;
> -
> -       user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> +       struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
> +       struct drm_i915_gem_relocation_entry __user *urelocs;
> +       const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> +       unsigned int remain;
>
> +       urelocs = u64_to_user_ptr(entry->relocs_ptr);
>         remain = entry->relocation_count;
> -       while (remain) {
> -               struct drm_i915_gem_relocation_entry *r = stack_reloc;
> -               unsigned long unwritten;
> -               unsigned int count;
> +       if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))

How about N_RELOC(ULONG_MAX)?
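That is, something like this (untested; N_RELOC() already divides by
the entry size, so it is the same value as the open-coded division, and
I'm assuming the snipped body bails out with -EINVAL):

        if (unlikely(remain > N_RELOC(ULONG_MAX)))
                return -EINVAL;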
> @@ -732,66 +1164,66 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
>          * this is bad and so lockdep complains vehemently.
>          */
>         pagefault_disable();
> -       unwritten = __copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]));
> -       pagefault_enable();
> -       if (unlikely(unwritten)) {
> -               ret = -EFAULT;
> +       if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
> +               pagefault_enable();
> +               remain = -EFAULT;
>                 goto out;
>         }
> +       pagefault_enable();

Why dupe the pagefault_enable?

>
> +       remain -= count;
>         do {
> -               u64 offset = r->presumed_offset;
> +               u64 offset = eb_relocate_entry(eb, vma, r);
>
> -               ret = eb_relocate_entry(vma, eb, r);
> -               if (ret)
> +               if (likely(offset == 0)) {

Sparse not complaining?

> +               } else if ((s64)offset < 0) {
> +                       remain = (s64)offset;
>                         goto out;
> -
> -               if (r->presumed_offset != offset) {
> +               } else {
> +                       /* Note that reporting an error now
> +                        * leaves everything in an inconsistent
> +                        * state as we have *already* changed
> +                        * the relocation value inside the
> +                        * object. As we have not changed the
> +                        * reloc.presumed_offset or will not
> +                        * change the execobject.offset, on the
> +                        * call we may not rewrite the value
> +                        * inside the object, leaving it
> +                        * dangling and causing a GPU hang.
> +                        */
>                         pagefault_disable();
> -                       unwritten = __put_user(r->presumed_offset,
> -                                              &user_relocs->presumed_offset);
> +                       __put_user(offset & ~1,

Magic.

> +                                  &urelocs[r-stack].presumed_offset);

There's the comment, so maybe worth if (__put_user()) DRM_DEBUG?
(Rough sketch below.)

>                         pagefault_enable();

<SNIP>

> +static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
> {

<SNIP>

> +       const unsigned long relocs_max =
> +               ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);

Could be a define; this is used above too.

<SNIP>

> +       return __get_user(c, end - 1);

What's the point in this final check?

> }
>
> -static bool
> -need_reloc_mappable(struct i915_vma *vma)
> +static int
> +eb_copy_relocations(const struct i915_execbuffer *eb)
> {
> -       struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> -       if (entry->relocation_count == 0)
> -               return false;
> -
> -       if (!i915_vma_is_ggtt(vma))
> -               return false;
> +       const unsigned int count = eb->buffer_count;
> +       unsigned int i;
> +       int ret;
>
> -       /* See also use_cpu_reloc() */
> -       if (HAS_LLC(to_i915(vma->obj->base.dev)))
> -               return false;
> +       for (i = 0; i < count; i++) {
> +               struct drm_i915_gem_relocation_entry __user *urelocs;
> +               struct drm_i915_gem_relocation_entry *relocs;
> +               unsigned int nreloc = eb->exec[i].relocation_count, j;

No hidden variables in assignment lines.
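For the hidden variable, just split the declarations:

                unsigned int nreloc = eb->exec[i].relocation_count;
                unsigned int j;

And the promised __put_user() sketch from above (untested, the message
text is only an example):

                        pagefault_disable();
                        if (unlikely(__put_user(offset & ~1,
                                                &urelocs[r - stack].presumed_offset))) {
                                /* We cannot report the error (see the
                                 * comment in the patch), but we can at
                                 * least leave a breadcrumb in dmesg.
                                 */
                                DRM_DEBUG("execbuf: writeback of presumed_offset failed\n");
                        }
                        pagefault_enable();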
> -static bool
> -eb_vma_misplaced(struct i915_vma *vma)
> -{
> -       struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> +       urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
> +       size = nreloc * sizeof(*relocs);
>
> -       WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
> -               !i915_vma_is_ggtt(vma));
> +       relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
> +       if (!relocs) {
> +               drm_free_large(relocs);
> +               ret = -ENOMEM;
> +               goto err;
> +       }
>
> -       if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
> -               return true;
> +       /* copy_from_user is limited to 4GiB */
> +       j = 0;
> +       do {
> +               u32 len = min_t(u64, 1ull<<31, size);

BIT_ULL(31)

> +static void eb_export_fence(struct drm_i915_gem_object *obj,
> +                           struct drm_i915_gem_request *req,
> +                           unsigned int flags)
> +{
> +       struct reservation_object *resv = obj->resv;
> +
> +       /* Ignore errors from failing to allocate the new fence, we can't
> +        * handle an error right now. Worst case should be missed
> +        * synchronisation leading to rendering corruption.
> +        */

Worth a DRM_DEBUG?

> @@ -1155,10 +1524,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
>         }
>
>         ret = i915_gem_request_await_object
> -               (eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
> +               (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
>         if (ret)
>                 return ret;
> +
> +skip_flushes:
> +       obj->base.write_domain = 0;
> +       if (entry->flags & EXEC_OBJECT_WRITE) {
> +               obj->base.read_domains = 0;
> +               if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> +                       obj->cache_dirty = true;
> +               intel_fb_obj_invalidate(obj, ORIGIN_CS);
> +       }
> +       obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> +
> +       i915_vma_move_to_active(vma, eb->request, entry->flags);
> +       __eb_unreserve_vma(vma, entry);
> +       vma->exec_entry = NULL;

This seems like a bugfix hunk lost to a refactoring patch.

> @@ -1377,16 +1629,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
>                 return -EINVAL;
>         }
>
> -       cs = intel_ring_begin(req, 4 * 3);
> +       cs = intel_ring_begin(req, 4 * 2 + 2);
>         if (IS_ERR(cs))
>                 return PTR_ERR(cs);
>
> +       *cs++ = MI_LOAD_REGISTER_IMM(4);
>         for (i = 0; i < 4; i++) {
> -               *cs++ = MI_LOAD_REGISTER_IMM(1);
>                 *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
>                 *cs++ = 0;
>         }
> -
> +       *cs++ = MI_NOOP;
>         intel_ring_advance(req, cs);

Again a lost hunk.

>
>         return 0;

> @@ -1422,10 +1674,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
>                 goto out;
>
>         vma->exec_entry =
> -               memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
> +               memset(&eb->exec[eb->buffer_count++],
> +                      0, sizeof(*vma->exec_entry));
>         vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
> -       i915_gem_object_get(shadow_batch_obj);
> -       list_add_tail(&vma->exec_link, &eb->vmas);
> +       vma->exec_entry->rsvd2 = (uintptr_t)vma;

Use the helper.

> @@ -1901,56 +2135,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
>                      struct drm_file *file)
> {
>         struct drm_i915_gem_execbuffer2 *args = data;
> -       struct drm_i915_gem_exec_object2 *exec2_list = NULL;
> +       struct drm_i915_gem_exec_object2 *exec2_list;
>         int ret;
>
>         if (args->buffer_count < 1 ||
> -           args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
> +           args->buffer_count >= UINT_MAX / sizeof(*exec2_list) - 1) {
>                 DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
>                 return -EINVAL;
>         }
>
> -       exec2_list = drm_malloc_gfp(args->buffer_count,
> +       if (!i915_gem_check_execbuffer(args))
> +               return -EINVAL;
> +
> +       exec2_list = drm_malloc_gfp(args->buffer_count + 1,

The "+ 1" is very unintuitive without a comment.
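Something like this, for example; a rough sketch only, with the
explanation taken from the eb_parse() hunk above (which appends the
shadow batch at eb->exec[eb->buffer_count++]) and assuming the usual
drm_malloc_gfp(nmemb, size, gfp) arguments for the snipped tail of the
call:

        /* Allocate an extra slot at the end of the array for use by
         * the command parser's shadow batch, appended by eb_parse().
         */
        exec2_list = drm_malloc_gfp(args->buffer_count + 1,
                                    sizeof(*exec2_list),
                                    GFP_TEMPORARY);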
With the comments, this is (90%);
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>

I'm pretty darn sure I ain't reviewing this again without some very
specific changelog and inter-diff.

Regards, Joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation