If we need to use clflush to prepare our batch for reads from memory, we
can instead bypass the cache by using non-temporal copies.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Reviewed-by: Matthew Auld <matthew.william.auld@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 73 +++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index e128e3ab8452..7d1424e514be 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -965,8 +965,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 {
 	unsigned int src_needs_clflush;
 	unsigned int dst_needs_clflush;
-	void *dst, *ptr;
-	int offset, n;
+	void *dst;
 	int ret;
 
 	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
@@ -983,31 +982,51 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 	if (IS_ERR(dst))
 		goto unpin_dst;
 
-	ptr = dst;
-	offset = offset_in_page(batch_start_offset);
-
-	/* We can avoid clflushing partial cachelines before the write if we
-	 * only every write full cache-lines. Since we know that both the
-	 * source and destination are in multiples of PAGE_SIZE, we can simply
-	 * round up to the next cacheline. We don't care about copying too much
-	 * here as we only validate up to the end of the batch.
-	 */
-	if (dst_needs_clflush & CLFLUSH_BEFORE)
-		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
-
-	for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
-		int len = min_t(int, batch_len, PAGE_SIZE - offset);
-		void *vaddr;
-
-		vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
-		if (src_needs_clflush)
-			drm_clflush_virt_range(vaddr + offset, len);
-		memcpy(ptr, vaddr + offset, len);
-		kunmap_atomic(vaddr);
-
-		ptr += len;
-		batch_len -= len;
-		offset = 0;
+	if (src_needs_clflush &&
+	    i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
+		void *src;
+
+		src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
+		if (IS_ERR(src))
+			goto shmem_copy;
+
+		i915_memcpy_from_wc(dst,
+				    src + batch_start_offset,
+				    ALIGN(batch_len, 16));
+		i915_gem_object_unpin_map(src_obj);
+	} else {
+		void *ptr;
+		int offset, n;
+
+shmem_copy:
+		offset = offset_in_page(batch_start_offset);
+
+		/* We can avoid clflushing partial cachelines before the write
+		 * if we only ever write full cache-lines. Since we know that
+		 * both the source and destination are in multiples of
+		 * PAGE_SIZE, we can simply round up to the next cacheline.
+		 * We don't care about copying too much here as we only
+		 * validate up to the end of the batch.
+		 */
+		if (dst_needs_clflush & CLFLUSH_BEFORE)
+			batch_len = roundup(batch_len,
+					    boot_cpu_data.x86_clflush_size);
+
+		ptr = dst;
+		for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
+			int len = min_t(int, batch_len, PAGE_SIZE - offset);
+			void *vaddr;
+
+			vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
+			if (src_needs_clflush)
+				drm_clflush_virt_range(vaddr + offset, len);
+			memcpy(ptr, vaddr + offset, len);
+			kunmap_atomic(vaddr);
+
+			ptr += len;
+			batch_len -= len;
+			offset = 0;
+		}
 	}
 
 	/* dst_obj is returned with vmap pinned */
-- 
2.9.3
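
As an aside for readers unfamiliar with the helper: i915_memcpy_from_wc()
boils down to SSE4.1 MOVNTDQA streaming loads, which read from
write-combining memory without dragging the source through the CPU
caches, and the zero-length call in the patch serves as a probe for
whether that path is usable at all (SSE4.1 available, 16-byte-aligned
offset). Below is a minimal userspace sketch of the same idea, not the
kernel implementation (which must also deal with saving FPU state); the
helper name memcpy_from_wc and the tail-byte fallback are illustrative
assumptions, and it presumes SSE4.1 (build with -msse4.1) plus a
16-byte-aligned source.

/*
 * Illustrative userspace sketch only, not the i915 code: a non-temporal
 * copy using SSE4.1 streaming loads. MOVNTDQA only behaves non-temporally
 * when the source is write-combining memory.
 */
#include <smmintrin.h>	/* _mm_stream_load_si128() */
#include <stddef.h>
#include <string.h>

static void memcpy_from_wc(void *dst, const void *src, size_t len)
{
	__m128i *d = dst;
	const __m128i *s = src;	/* assumed 16-byte aligned */

	/* Stream 16-byte chunks; loads bypass the cache on WC memory. */
	while (len >= 16) {
		_mm_storeu_si128(d++, _mm_stream_load_si128((__m128i *)s++));
		len -= 16;
	}

	/* Copy any tail bytes normally; the patch avoids this case by
	 * rounding batch_len up to a multiple of 16 with ALIGN().
	 */
	if (len)
		memcpy(d, s, len);
}

In the patch itself the source is the WC vmapping of src_obj and the
length is ALIGN(batch_len, 16), so only whole 16-byte chunks are ever
streamed, matching the aligned main loop above.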