With a little complexity to handle cmds straddling page boundaries, we can completely avoiding having to vmap the batch and the shadow batch objects whilst running the command parser. On ivb i7-3720MQ: x11perf -dot before 54.3M, after 53.2M (max 203M) glxgears before 7110 fps, after 7300 fps (max 7860 fps) Before: Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s After: Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s Removing the vmap is mostly a win, except we lose in a few cases where the batch size is greater than a page due to the extra complexity (loss of a simple cache efficient large copy, and boundary handling). v2: Reorder so that we do check oacontrol remaining set at end-of-batch Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_cmd_parser.c | 299 ++++++++++++++++----------------- 1 file changed, 148 insertions(+), 151 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 9605ff8f2fcd..4a80ab953715 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -818,100 +818,6 @@ static bool valid_reg(const u32 *table, int count, u32 addr) return false; } -static u32 *vmap_batch(struct drm_i915_gem_object *obj, - unsigned start, unsigned len) -{ - int i; - void *addr = NULL; - struct sg_page_iter sg_iter; - int first_page = start >> PAGE_SHIFT; - int last_page = (len + start + 4095) >> PAGE_SHIFT; - int npages = last_page - first_page; - struct page **pages; - - pages = drm_malloc_ab(npages, sizeof(*pages)); - if (pages == NULL) { - DRM_DEBUG_DRIVER("Failed to get space for pages\n"); - goto finish; - } - - i = 0; - for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) { - pages[i++] = sg_page_iter_page(&sg_iter); - if (i == npages) - break; - } - - addr = vmap(pages, i, 0, PAGE_KERNEL); - if (addr == NULL) { - DRM_DEBUG_DRIVER("Failed to vmap pages\n"); - goto finish; - } - -finish: - if (pages) - drm_free_large(pages); - return (u32*)addr; -} - -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */ -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj, - struct drm_i915_gem_object *src_obj, - u32 batch_start_offset, - u32 batch_len) -{ - int needs_clflush = 0; - void *src_base, *src; - void *dst = NULL; - int ret; - - if (batch_len > dest_obj->base.size || - batch_len + batch_start_offset > src_obj->base.size) - return ERR_PTR(-E2BIG); - - if (WARN_ON(dest_obj->pages_pin_count == 0)) - return ERR_PTR(-ENODEV); - - ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush); - if (ret) { - DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n"); - return ERR_PTR(ret); - } - - src_base = vmap_batch(src_obj, batch_start_offset, batch_len); - if (!src_base) { - DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n"); - ret = -ENOMEM; - goto unpin_src; - } - - ret = i915_gem_object_set_to_cpu_domain(dest_obj, true); - if (ret) { - DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n"); - goto unmap_src; - } - - dst = vmap_batch(dest_obj, 0, batch_len); - if (!dst) { - DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n"); - ret = -ENOMEM; - goto unmap_src; - } - - src = src_base + offset_in_page(batch_start_offset); - if (needs_clflush) - drm_clflush_virt_range(src, batch_len); - - memcpy(dst, src, batch_len); - -unmap_src: - vunmap(src_base); -unpin_src: - i915_gem_object_unpin_pages(src_obj); - - return ret ? ERR_PTR(ret) : dst; -} - /** * i915_needs_cmd_parser() - should a given ring use software command parsing? * @ring: the ring in question @@ -1046,16 +952,34 @@ int i915_parse_cmds(struct intel_engine_cs *ring, u32 batch_len, bool is_master) { - u32 *cmd, *batch_base, *batch_end; + u32 tmp[128]; + struct sg_page_iter src_iter, dst_iter; + const struct drm_i915_cmd_descriptor *desc; + int needs_clflush = 0; + void *src, *dst; + unsigned in, out; + u32 *buf, partial = 0, length; struct drm_i915_cmd_descriptor default_desc = { 0 }; bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */ int ret = 0; - batch_base = copy_batch(shadow_batch_obj, batch_obj, - batch_start_offset, batch_len); - if (IS_ERR(batch_base)) { - DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n"); - return PTR_ERR(batch_base); + if (batch_len > shadow_batch_obj->base.size || + batch_len + batch_start_offset > batch_obj->base.size) + return -E2BIG; + + if (WARN_ON(shadow_batch_obj->pages_pin_count == 0)) + return -ENODEV; + + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush); + if (ret) { + DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n"); + return ret; + } + + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true); + if (ret) { + DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n"); + goto unpin; } /* @@ -1063,68 +987,141 @@ int i915_parse_cmds(struct intel_engine_cs *ring, * large or larger and copy_batch() will write MI_NOPs to the extra * space. Parsing should be faster in some cases this way. */ - batch_end = batch_base + (batch_len / sizeof(*batch_end)); + ret = -EINVAL; + + __sg_page_iter_start(&dst_iter, + shadow_batch_obj->pages->sgl, + shadow_batch_obj->pages->nents, + 0); + __sg_page_iter_next(&dst_iter); + dst = kmap_atomic(sg_page_iter_page(&dst_iter)); + + in = offset_in_page(batch_start_offset); + out = 0; + for_each_sg_page(batch_obj->pages->sgl, + &src_iter, + batch_obj->pages->nents, + batch_start_offset >> PAGE_SHIFT) { + u32 this, i, j, k; + u32 *cmd, *page_end, *batch_end; + + this = batch_len; + if (this > PAGE_SIZE - in) + this = PAGE_SIZE - in; + + src = kmap_atomic(sg_page_iter_page(&src_iter)); + if (needs_clflush) + drm_clflush_virt_range(src + in, this); + + i = this; + j = in; + do { + /* We keep dst around so that we do not blow + * the CPU caches immediately after the copy (due + * to the kunmap_atomic(dst) doing a TLB flush. + */ + if (out == PAGE_SIZE) { + __sg_page_iter_next(&dst_iter); + kunmap_atomic(dst); + dst = kmap_atomic(sg_page_iter_page(&dst_iter)); + out = 0; + } - cmd = batch_base; - while (cmd < batch_end) { - const struct drm_i915_cmd_descriptor *desc; - u32 length; + k = i; + if (k > PAGE_SIZE - out) + k = PAGE_SIZE - out; + if (k == PAGE_SIZE) + copy_page(dst, src); + else + memcpy(dst + out, src + j, k); + + out += k; + j += k; + i -= k; + } while (i); + + cmd = src + in; + page_end = (void *)cmd + this; + batch_end = (void *)cmd + batch_len; + + if (partial) { + memcpy(tmp + partial, cmd, (length - partial)*4); + cmd -= partial; + partial = 0; + buf = tmp; + goto check; + } - if (*cmd == MI_BATCH_BUFFER_END) - break; + do { + if (*cmd == MI_BATCH_BUFFER_END) { + if (oacontrol_set) { + DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n"); + ret = -EINVAL; + } else + ret = 0; + goto unmap; + } - desc = find_cmd(ring, *cmd, &default_desc); - if (!desc) { - DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", - *cmd); - ret = -EINVAL; - break; - } + desc = find_cmd(ring, *cmd, &default_desc); + if (!desc) { + DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n", + *cmd); + goto unmap; + } - /* - * If the batch buffer contains a chained batch, return an - * error that tells the caller to abort and dispatch the - * workload as a non-secure batch. - */ - if (desc->cmd.value == MI_BATCH_BUFFER_START) { - ret = -EACCES; - break; - } + /* + * If the batch buffer contains a chained batch, return an + * error that tells the caller to abort and dispatch the + * workload as a non-secure batch. + */ + if (desc->cmd.value == MI_BATCH_BUFFER_START) { + ret = -EACCES; + goto unmap; + } - if (desc->flags & CMD_DESC_FIXED) - length = desc->length.fixed; - else - length = ((*cmd & desc->length.mask) + LENGTH_BIAS); - - if ((batch_end - cmd) < length) { - DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n", - *cmd, - length, - batch_end - cmd); - ret = -EINVAL; - break; - } + if (desc->flags & CMD_DESC_FIXED) + length = desc->length.fixed; + else + length = ((*cmd & desc->length.mask) + LENGTH_BIAS); + + if (cmd + length > page_end) { + if (length + cmd > batch_end) { + DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n", + *cmd, length, batch_end - cmd); + goto unmap; + } + + if (WARN_ON(length > sizeof(tmp)/4)) { + ret = -ENODEV; + goto unmap; + } + + partial = page_end - cmd; + memcpy(tmp, cmd, partial*4); + break; + } - if (!check_cmd(ring, desc, cmd, is_master, &oacontrol_set)) { - ret = -EINVAL; - break; - } + buf = cmd; +check: + if (!check_cmd(ring, desc, buf, is_master, &oacontrol_set)) + goto unmap; - cmd += length; - } + cmd += length; + } while (cmd < page_end); - if (oacontrol_set) { - DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n"); - ret = -EINVAL; - } + batch_len -= this; + if (batch_len == 0) + break; - if (cmd >= batch_end) { - DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n"); - ret = -EINVAL; + kunmap_atomic(src); + in = 0; } - vunmap(batch_base); - +unmap: + kunmap_atomic(src); + kunmap_atomic(dst); +unpin: + i915_gem_object_unpin_pages(batch_obj); return ret; } -- 2.1.4 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx