Hi Thomas, On 23.02.2022 20:38, Thomas Zimmermann wrote: > Improve the performance of cfb_imageblit() by manually unrolling > the inner blitting loop and moving some invariants out. The compiler > failed to do this automatically. This change keeps cfb_imageblit() > in sync with sys_imageblit(). > > A microbenchmark measures the average number of CPU cycles > for cfb_imageblit() after a stabilizing period of a few minutes > (i7-4790, FullHD, simpledrm, kernel with debugging). > > cfb_imageblit(), new: 15724 cycles > cfb_imageblit(), old: 30566 cycles > > In the optimized case, cfb_imageblit() is now ~2x faster than before. > > v3: > * fix commit description (Pekka) > > Signed-off-by: Thomas Zimmermann <tzimmermann@xxxxxxx> > Acked-by: Sam Ravnborg <sam@xxxxxxxxxxxx> > Reviewed-by: Javier Martinez Canillas <javierm@xxxxxxxxxx> This patch recently landed in linux-next (next-20220308) as commit 0d03011894d2 ("fbdev: Improve performance of cfb_imageblit()"). Sadly, it causes a freeze after DRM and emulated fbdev initialization on various Samsung Exynos 32-bit ARM based boards. This happens when the kernel is compiled from exynos_defconfig. Surprisingly, when the kernel is compiled from multi_v7_defconfig, all those boards boot fine, so the issue seems to be triggered by one of the debugging options enabled in exynos_defconfig. I will try to analyze this further and share the results. Reverting $subject on top of next-20220308 fixes the boot issue. 
> --- > drivers/video/fbdev/core/cfbimgblt.c | 51 +++++++++++++++++++++++----- > 1 file changed, 42 insertions(+), 9 deletions(-) > > diff --git a/drivers/video/fbdev/core/cfbimgblt.c b/drivers/video/fbdev/core/cfbimgblt.c > index 01b01a279681..7361cfabdd85 100644 > --- a/drivers/video/fbdev/core/cfbimgblt.c > +++ b/drivers/video/fbdev/core/cfbimgblt.c > @@ -218,23 +218,29 @@ static inline void fast_imageblit(const struct fb_image *image, struct fb_info * > { > u32 fgx = fgcolor, bgx = bgcolor, bpp = p->var.bits_per_pixel; > u32 ppw = 32/bpp, spitch = (image->width + 7)/8; > - u32 bit_mask, end_mask, eorx, shift; > + u32 bit_mask, eorx; > const char *s = image->data, *src; > u32 __iomem *dst; > const u32 *tab = NULL; > + size_t tablen; > + u32 colortab[16]; > int i, j, k; > > switch (bpp) { > case 8: > tab = fb_be_math(p) ? cfb_tab8_be : cfb_tab8_le; > + tablen = 16; > break; > case 16: > tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le; > + tablen = 4; > break; > case 32: > - default: > tab = cfb_tab32; > + tablen = 2; > break; > + default: > + return; > } > > for (i = ppw-1; i--; ) { > @@ -248,15 +254,42 @@ static inline void fast_imageblit(const struct fb_image *image, struct fb_info * > eorx = fgx ^ bgx; > k = image->width/ppw; > > - for (i = image->height; i--; ) { > - dst = (u32 __iomem *) dst1, shift = 8; src = s; > + for (i = 0; i < tablen; ++i) > + colortab[i] = (tab[i] & eorx) ^ bgx; > > - for (j = k; j--; ) { > - shift -= ppw; > - end_mask = tab[(*src >> shift) & bit_mask]; > - FB_WRITEL((end_mask & eorx)^bgx, dst++); > - if (!shift) { shift = 8; src++; } > + for (i = image->height; i--; ) { > + dst = (u32 __iomem *)dst1; > + src = s; > + > + switch (ppw) { > + case 4: /* 8 bpp */ > + for (j = k; j; j -= 2, ++src) { > + FB_WRITEL(colortab[(*src >> 4) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 0) & bit_mask], dst++); > + } > + break; > + case 2: /* 16 bpp */ > + for (j = k; j; j -= 4, ++src) { > + FB_WRITEL(colortab[(*src >> 6) & bit_mask], 
dst++); > + FB_WRITEL(colortab[(*src >> 4) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 2) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 0) & bit_mask], dst++); > + } > + break; > + case 1: /* 32 bpp */ > + for (j = k; j; j -= 8, ++src) { > + FB_WRITEL(colortab[(*src >> 7) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 6) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 5) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 4) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 3) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 2) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 1) & bit_mask], dst++); > + FB_WRITEL(colortab[(*src >> 0) & bit_mask], dst++); > + } > + break; > } > + > dst1 += p->fix.line_length; > s += spitch; > } Best regards -- Marek Szyprowski, PhD Samsung R&D Institute Poland