On 08.02.2022 11:11, Lucas De Marchi wrote: > On Mon, Feb 07, 2022 at 09:43:08PM +0530, Balasubramani Vivekanandan wrote: > > memcpy_from_wc functions can fail if SSE4.1 is not supported or the > > supplied addresses are not 16-byte aligned. It was then upto to the > > caller to use memcpy as fallback. > > Now fallback to memcpy is implemented inside memcpy_from_wc functions > > relieving the user from checking the return value of i915_memcpy_from_wc > > and doing fallback. > > > > When doing copying from io memory address memcpy_fromio should be used > > as fallback. So a new function is added to the family of memcpy_to_wc > > functions which should be used while copying from io memory. > > > > This change is implemented also with an intention to perpare for porting > > memcpy_from_wc code to ARM64. Since SSE4.1 is not valid for ARM, > > accelerated reads will not be supported and the driver should rely on > > fallback always. > > So there would be few more places in the code where fallback should be > > introduced. For e.g. GuC log relay is currently not using fallback since > > a GPU supporting GuC submission will mostly have SSE4.1 enabled CPU. > > This is no more valid with Discrete GPU and with enabling support for > > ARM64. > > With fallback moved inside memcpy_from_wc function, call sites would > > look neat and fallback can be implemented in a uniform way. > > > > Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan@xxxxxxxxx> > > --- > > drivers/gpu/drm/i915/gem/i915_gem_object.c | 5 +- > > drivers/gpu/drm/i915/gt/selftest_reset.c | 8 ++- > > drivers/gpu/drm/i915/i915_gpu_error.c | 9 ++- > > drivers/gpu/drm/i915/i915_memcpy.c | 78 ++++++++++++++++------ > > drivers/gpu/drm/i915/i915_memcpy.h | 18 ++--- > > 5 files changed, 78 insertions(+), 40 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c > > index e03e362d320b..e187c4bfb7e4 100644 > > --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c > > @@ -444,7 +444,7 @@ static void > > i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset, void *dst, int size) > > { > > void __iomem *src_map; > > - void __iomem *src_ptr; > > + const void __iomem *src_ptr; > > dma_addr_t dma = i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT); > > > > src_map = io_mapping_map_wc(&obj->mm.region->iomap, > > @@ -452,8 +452,7 @@ i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset > > PAGE_SIZE); > > > > src_ptr = src_map + offset_in_page(offset); > > - if (!i915_memcpy_from_wc(dst, (void __force *)src_ptr, size)) > > - memcpy_fromio(dst, src_ptr, size); > > + i915_io_memcpy_from_wc(dst, src_ptr, size); > > nitpick, but maybe to align with the memcpy_fromio() API this would > better be named i915_memcpy_fromio_wc()? I too thought for a moment should I rename to i915_memcpy_fromio_wc() but stayed with the current name, when preparing the patch. I will rename it. > > > > > io_mapping_unmap(src_map); > > } > > diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c > > index 37c38bdd5f47..64b8521a8b28 100644 > > --- a/drivers/gpu/drm/i915/gt/selftest_reset.c > > +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c > > @@ -99,8 +99,10 @@ __igt_reset_stolen(struct intel_gt *gt, > > memset_io(s, STACK_MAGIC, PAGE_SIZE); > > > > in = (void __force *)s; > > - if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE)) > > + if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) { > > + i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE); > > in = tmp; > > + } > > crc[page] = crc32_le(0, in, PAGE_SIZE); > > > > io_mapping_unmap(s); > > @@ -135,8 +137,10 @@ __igt_reset_stolen(struct intel_gt *gt, > > PAGE_SIZE); > > > > in = (void __force *)s; > > - if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE)) > > + if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) { > > + i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE); > > but you removed __iomem above Yeah, it is a mistake. I will change it. There is one more place in the same file which needs correction. Regards, Bala > > > in = tmp; > > + } > > x = crc32_le(0, in, PAGE_SIZE); > > > > if (x != crc[page] && > > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > > index 127ff56c8ce6..2c14a28cbbbb 100644 > > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > > @@ -297,8 +297,10 @@ static int compress_page(struct i915_vma_compress *c, > > struct z_stream_s *zstream = &c->zstream; > > > > zstream->next_in = src; > > - if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) > > + if (wc && c->tmp && i915_can_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) { > > + i915_io_memcpy_from_wc(c->tmp, (const void __iomem *)src, PAGE_SIZE); > > zstream->next_in = c->tmp; > > + } > > zstream->avail_in = PAGE_SIZE; > > > > do { > > @@ -397,8 +399,11 @@ static int compress_page(struct i915_vma_compress *c, > > if (!ptr) > > return -ENOMEM; > > > > - if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) > > + if (wc) > > + i915_io_memcpy_from_wc(ptr, src, PAGE_SIZE); > > + else > > memcpy(ptr, src, PAGE_SIZE); > > + > > list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list); > > cond_resched(); > > > > diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c > > index 1b021a4902de..4d9fbf3b2614 100644 > > --- a/drivers/gpu/drm/i915/i915_memcpy.c > > +++ b/drivers/gpu/drm/i915/i915_memcpy.c > > @@ -24,15 +24,10 @@ > > > > #include <linux/kernel.h> > > #include <asm/fpu/api.h> > > +#include <linux/io.h> > > > > #include "i915_memcpy.h" > > > > -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG) > > -#define CI_BUG_ON(expr) BUG_ON(expr) > > -#else > > -#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr) > > -#endif > > - > > static DEFINE_STATIC_KEY_FALSE(has_movntdqa); > > > > static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len) > > @@ -93,6 +88,26 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len) > > kernel_fpu_end(); > > } > > > > +/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment, > > + * as well as SSE4.1 support. To check beforehand, pass in the parameters to > > + * i915_can_memcpy_from_wc() - since we only care about the low 4 bits, > > + * you only need to pass in the minor offsets, page-aligned pointers are > > + * always valid. > > + * > > + * For just checking for SSE4.1, in the foreknowledge that the future use > > + * will be correctly aligned, just use i915_has_memcpy_from_wc(). > > + */ > > +bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len) > > +{ > > + if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) > > + return false; > > + > > + if (static_branch_likely(&has_movntdqa)) > > + return true; > > + > > + return false; > > +} > > + > > /** > > * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC > > * @dst: destination pointer > > @@ -104,24 +119,18 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len) > > * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple > > * of 16. > > * > > - * To test whether accelerated reads from WC are supported, use > > - * i915_memcpy_from_wc(NULL, NULL, 0); > > - * > > - * Returns true if the copy was successful, false if the preconditions > > - * are not met. > > + * If the acccelerated read from WC is not possible fallback to memcpy > > */ > > -bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) > > +void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) > > { > > - if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15)) > > - return false; > > - > > - if (static_branch_likely(&has_movntdqa)) { > > + if (i915_can_memcpy_from_wc(dst, src, len)) { > > if (likely(len)) > > __memcpy_ntdqa(dst, src, len >> 4); > > - return true; > > + return; > > } > > > > - return false; > > + /* Fallback */ > > + memcpy(dst, src, len); > > } > > > > /** > > @@ -134,12 +143,15 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len) > > * @src to @dst using * non-temporal instructions where available, but > > * accepts that its arguments may not be aligned, but are valid for the > > * potential 16-byte read past the end. > > + * > > + * Fallback to memcpy if accelerated read is not supported > > */ > > void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len) > > { > > unsigned long addr; > > > > - CI_BUG_ON(!i915_has_memcpy_from_wc()); > > + if (!i915_has_memcpy_from_wc()) > > + goto fallback; > > > > addr = (unsigned long)src; > > if (!IS_ALIGNED(addr, 16)) { > > @@ -154,6 +166,34 @@ void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len > > > > if (likely(len)) > > __memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16)); > > + > > + return; > > + > > +fallback: > > + memcpy(dst, src, len); > > +} > > + > > +/** > > + * i915_io_memcpy_from_wc: perform an accelerated *aligned* read from WC > > + * @dst: destination pointer > > + * @src: source pointer > > + * @len: how many bytes to copy > > + * > > + * To be used when the when copying from io memory. > > + * > > + * memcpy_fromio() is used as fallback otherewise no difference to > > + * i915_memcpy_from_wc() > > + */ > > +void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len) > > +{ > > + if (i915_can_memcpy_from_wc(dst, (const void __force *)src, len)) { > > + if (likely(len)) > > + __memcpy_ntdqa(dst, (const void __force *)src, len >> 4); > > + return; > > + } > > + > > + /* Fallback */ > > + memcpy_fromio(dst, src, len); > > } > > > > void i915_memcpy_init_early(struct drm_i915_private *dev_priv) > > diff --git a/drivers/gpu/drm/i915/i915_memcpy.h b/drivers/gpu/drm/i915/i915_memcpy.h > > index 3df063a3293b..93ea9295e28c 100644 > > --- a/drivers/gpu/drm/i915/i915_memcpy.h > > +++ b/drivers/gpu/drm/i915/i915_memcpy.h > > @@ -12,23 +12,13 @@ struct drm_i915_private; > > > > void i915_memcpy_init_early(struct drm_i915_private *i915); > > > > -bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); > > +void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len); > > void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len); > > +void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len); > > > > -/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment, > > - * as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot > > - * perform the operation. To check beforehand, pass in the parameters to > > - * to i915_can_memcpy_from_wc() - since we only care about the low 4 bits, > > - * you only need to pass in the minor offsets, page-aligned pointers are > > - * always valid. > > - * > > - * For just checking for SSE4.1, in the foreknowledge that the future use > > - * will be correctly aligned, just use i915_has_memcpy_from_wc(). > > - */ > > -#define i915_can_memcpy_from_wc(dst, src, len) \ > > - i915_memcpy_from_wc((void *)((unsigned long)(dst) | (unsigned long)(src) | (len)), NULL, 0) > > +bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len); > > > > #define i915_has_memcpy_from_wc() \ > > - i915_memcpy_from_wc(NULL, NULL, 0) > > + i915_can_memcpy_from_wc(NULL, NULL, 0) > > I think the has vs can here is confusing. But a cleanup on that could be > on top since it would just add noise to this patch. > > I or someone else probably need a more careful review, but ack on the > direction: > > > > Acked-by: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> > > > Lucas De Marchi