On to, 2016-06-09 at 12:29 +0100, Chris Wilson wrote: > This is a companion to i915_gem_obj_prepare_shmem_read() that prepares > the backing storage for direct writes. It first serialises with the GPU, > pins the backing storage and then indicates what clfushes are required in > order for the writes to be coherent. > > Whilst here, fix support for ancient CPUs without clflush for which we > cannot do the GTT+clflush tricks. > Could add here that WARN is kicked > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_cmd_parser.c | 2 +- > drivers/gpu/drm/i915/i915_drv.h | 6 +- > drivers/gpu/drm/i915/i915_gem.c | 119 +++++++++++++++++++++------------ > 3 files changed, 83 insertions(+), 44 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c > index b0fd6a7b0603..de038da6c01b 100644 > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c > @@ -970,7 +970,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dest_obj, > u32 batch_start_offset, > u32 batch_len) > { > - int needs_clflush = 0; > + unsigned needs_clflush; Mildly irritated by innuendo to boolean. Not sure why not just "flags". > void *src_base, *src; > void *dst = NULL; > int ret; > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index b333cf4923bc..ef321f84e5fc 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3026,7 +3026,11 @@ void i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv); > void i915_gem_release_mmap(struct drm_i915_gem_object *obj); > > int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, > - int *needs_clflush); > + unsigned *needs_clflush); > +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, > + unsigned *needs_clflush); > +#define CLFLUSH_BEFORE 0x1 > +#define CLFLUSH_AFTER 0x2 > > int __must_check i915_gem_object_get_pages(struct drm_i915_gem_object *obj); > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index a41fa01eb024..c6d06cb21191 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -563,34 +563,95 @@ __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset, > * flush the object from the CPU cache. > */ > int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj, > - int *needs_clflush) > + unsigned *needs_clflush) > { > int ret; > > *needs_clflush = 0; > - > - if (WARN_ON((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0)) > + if ((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0) Why no more WARN? > return -EINVAL; > > if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) { > + ret = i915_gem_object_wait_rendering(obj, true); > + if (ret) > + return ret; > + > /* If we're not in the cpu read domain, set ourself into the gtt > * read domain and manually flush cachelines (if required). This > * optimizes for the case when the gpu will dirty the data > * anyway again before the next pread happens. */ > *needs_clflush = !cpu_cache_is_coherent(obj->base.dev, > obj->cache_level); Maybe add CLFLUSH_NONE and use ?:, this is deceiving now that it's not boolean, and that's why the name should be changed too. > - ret = i915_gem_object_wait_rendering(obj, true); > + } > + > + ret = i915_gem_object_get_pages(obj); > + if (ret) > + return ret; > + > + i915_gem_object_pin_pages(obj); > + > + if (*needs_clflush && !boot_cpu_has(X86_FEATURE_CLFLUSH)) { Your own comment applies here. > + ret = i915_gem_object_set_to_cpu_domain(obj, false); > + if (ret) { > + i915_gem_object_unpin_pages(obj); > + return ret; > + } > + *needs_clflush = 0; > + } > + > + return 0; > +} > + > +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj, > + unsigned *needs_clflush) > +{ > + int ret; > + > + *needs_clflush = 0; > + if ((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0) > + return -EINVAL; > + > + if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) { > + ret = i915_gem_object_wait_rendering(obj, false); > if (ret) > return ret; > + > + /* If we're not in the cpu write domain, set ourself into the > + * gtt write domain and manually flush cachelines (as required). > + * This optimizes for the case when the gpu will use the data > + * right away and we therefore have to clflush anyway. > + */ > + *needs_clflush |= cpu_write_needs_clflush(obj) << 1; > } > > + /* Same trick applies to invalidate partially written cachelines read > + * before writing. > + */ > + if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) > + *needs_clflush |= !cpu_cache_is_coherent(obj->base.dev, > + obj->cache_level); > + > ret = i915_gem_object_get_pages(obj); > if (ret) > return ret; > > i915_gem_object_pin_pages(obj); > > - return ret; > + if (*needs_clflush && !boot_cpu_has(X86_FEATURE_CLFLUSH)) { > + ret = i915_gem_object_set_to_cpu_domain(obj, true); > + if (ret) { > + i915_gem_object_unpin_pages(obj); > + return ret; > + } > + *needs_clflush = 0; > + } Relatively large common mid-section in these two, static func? Otherwise looks fine to me, those explained or fixed; Reviewed-by: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> Regards, Joonas > + > + if ((*needs_clflush & CLFLUSH_AFTER) == 0) > + obj->cache_dirty = true; > + > + intel_fb_obj_invalidate(obj, ORIGIN_CPU); > + obj->dirty = 1; > + return 0; > + } > > /* Per-page copy function for the shmem pread fastpath. > @@ -999,41 +1060,17 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > int shmem_page_offset, page_length, ret = 0; > int obj_do_bit17_swizzling, page_do_bit17_swizzling; > int hit_slowpath = 0; > - int needs_clflush_after = 0; > - int needs_clflush_before = 0; > + unsigned needs_clflush; > struct sg_page_iter sg_iter; > > - user_data = u64_to_user_ptr(args->data_ptr); > - remain = args->size; > - > - obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); > - > - if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) { > - /* If we're not in the cpu write domain, set ourself into the gtt > - * write domain and manually flush cachelines (if required). This > - * optimizes for the case when the gpu will use the data > - * right away and we therefore have to clflush anyway. */ > - needs_clflush_after = cpu_write_needs_clflush(obj); > - ret = i915_gem_object_wait_rendering(obj, false); > - if (ret) > - return ret; > - } > - /* Same trick applies to invalidate partially written cachelines read > - * before writing. */ > - if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) > - needs_clflush_before = > - !cpu_cache_is_coherent(dev, obj->cache_level); > - > - ret = i915_gem_object_get_pages(obj); > + ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush); > if (ret) > return ret; > > - intel_fb_obj_invalidate(obj, ORIGIN_CPU); > - > - i915_gem_object_pin_pages(obj); > - > + obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); > + user_data = u64_to_user_ptr(args->data_ptr); > offset = args->offset; > - obj->dirty = 1; > + remain = args->size; > > for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, > offset >> PAGE_SHIFT) { > @@ -1057,7 +1094,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > /* If we don't overwrite a cacheline completely we need to be > * careful to have up-to-date data by first clflushing. Don't > * overcomplicate things and flush the entire patch. */ > - partial_cacheline_write = needs_clflush_before && > + partial_cacheline_write = needs_clflush & CLFLUSH_BEFORE && > ((shmem_page_offset | page_length) > & (boot_cpu_data.x86_clflush_size - 1)); > > @@ -1067,7 +1104,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > ret = shmem_pwrite_fast(page, shmem_page_offset, page_length, > user_data, page_do_bit17_swizzling, > partial_cacheline_write, > - needs_clflush_after); > + needs_clflush & CLFLUSH_AFTER); > if (ret == 0) > goto next_page; > > @@ -1076,7 +1113,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, > ret = shmem_pwrite_slow(page, shmem_page_offset, page_length, > user_data, page_do_bit17_swizzling, > partial_cacheline_write, > - needs_clflush_after); > + needs_clflush & CLFLUSH_AFTER); > > mutex_lock(&dev->struct_mutex); > > @@ -1098,17 +1135,15 @@ out: > * cachelines in-line while writing and the object moved > * out of the cpu write domain while we've dropped the lock. > */ > - if (!needs_clflush_after && > + if (!(needs_clflush & CLFLUSH_AFTER) && > obj->base.write_domain != I915_GEM_DOMAIN_CPU) { > if (i915_gem_clflush_object(obj, obj->pin_display)) > - needs_clflush_after = true; > + needs_clflush |= CLFLUSH_AFTER; > } > } > > - if (needs_clflush_after) > + if (needs_clflush & CLFLUSH_AFTER) > i915_gem_chipset_flush(to_i915(dev)); > - else > - obj->cache_dirty = true; > > intel_fb_obj_flush(obj, false, ORIGIN_CPU); > return ret; -- Joonas Lahtinen Open Source Technology Center Intel Corporation _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx