vmalloc BOs give us cached reads, so there is no need to prefetch in that case. Prefetching gives a ~20% speedup on a cma buffer using the mi0283qt driver on a Raspberry Pi 1. Signed-off-by: Noralf Trønnes <noralf@xxxxxxxxxxx> --- drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 54 ++++++++++++++------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c index ee9a8f305b26..bca905213cdd 100644 --- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c +++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c @@ -15,6 +15,8 @@ #include <linux/swab.h> #include <drm/drmP.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_framebuffer_helper.h> #include <drm/tinydrm/tinydrm.h> #include <drm/tinydrm/tinydrm-helpers.h> @@ -115,22 +117,25 @@ void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb, struct drm_clip_rect *clip) { size_t len = (clip->x2 - clip->x1) * sizeof(u16); + u16 *src, *buf = NULL; unsigned int x, y; - u16 *src, *buf; /* - * The cma memory is write-combined so reads are uncached. - * Speed up by fetching one line at a time. + * Imported buffers are likely to be write-combined with uncached + * reads. Speed up by fetching one line at a time. + * prefetch_range() was tried, but didn't give any noticeable speedup + * on the Raspberry Pi 1. 
*/ - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - return; + if (drm_gem_fb_get_obj(fb, 0)->import_attach) + buf = kmalloc(len, GFP_KERNEL); for (y = clip->y1; y < clip->y2; y++) { src = vaddr + (y * fb->pitches[0]); src += clip->x1; - memcpy(buf, src, len); - src = buf; + if (buf) { + memcpy(buf, src, len); + src = buf; + } for (x = clip->x1; x < clip->x2; x++) *dst++ = swab16(*src++); } @@ -155,19 +160,21 @@ void tinydrm_xrgb8888_to_rgb565(u16 *dst, void *vaddr, struct drm_clip_rect *clip, bool swap) { size_t len = (clip->x2 - clip->x1) * sizeof(u32); + u32 *src, *buf = NULL; unsigned int x, y; - u32 *src, *buf; u16 val16; - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - return; + /* See tinydrm_swab16() for an explanation */ + if (drm_gem_fb_get_obj(fb, 0)->import_attach) + buf = kmalloc(len, GFP_KERNEL); for (y = clip->y1; y < clip->y2; y++) { src = vaddr + (y * fb->pitches[0]); src += clip->x1; - memcpy(buf, src, len); - src = buf; + if (buf) { + memcpy(buf, src, len); + src = buf; + } for (x = clip->x1; x < clip->x2; x++) { val16 = ((*src & 0x00F80000) >> 8) | ((*src & 0x0000FC00) >> 5) | @@ -205,24 +212,23 @@ void tinydrm_xrgb8888_to_gray8(u8 *dst, void *vaddr, struct drm_framebuffer *fb, { unsigned int len = (clip->x2 - clip->x1) * sizeof(u32); unsigned int x, y; - void *buf; + void *buf = NULL; u32 *src; if (WARN_ON(fb->format->format != DRM_FORMAT_XRGB8888)) return; - /* - * The cma memory is write-combined so reads are uncached. - * Speed up by fetching one line at a time. 
- */ - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - return; + + /* See tinydrm_swab16() for an explanation */ + if (drm_gem_fb_get_obj(fb, 0)->import_attach) + buf = kmalloc(len, GFP_KERNEL); for (y = clip->y1; y < clip->y2; y++) { src = vaddr + (y * fb->pitches[0]); src += clip->x1; - memcpy(buf, src, len); - src = buf; + if (buf) { + memcpy(buf, src, len); + src = buf; + } for (x = clip->x1; x < clip->x2; x++) { u8 r = (*src & 0x00ff0000) >> 16; u8 g = (*src & 0x0000ff00) >> 8; -- 2.14.2 _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel