Hi,
here's some additional information on this patch.
Am 13.02.25 um 17:25 schrieb Thomas Zimmermann:
[...]
+
+ drm_fb_argb8888_to_argb4444(argb4444_dst, argb4444_dst_pitch,
+ shadow_plane_state->data, fb, &damage,
+ &shadow_plane_state->fmtcnv_state);
[...]
+static void drm_fb_argb8888_to_argb4444_line(void *dbuf, const void *sbuf, unsigned int pixels)
+{
+ unsigned int pixels2 = pixels & ~GENMASK_ULL(0, 0);
+ __le32 *dbuf32 = dbuf;
+ __le16 *dbuf16 = dbuf + pixels2 * sizeof(*dbuf16);
+ const __le32 *sbuf32 = sbuf;
+ unsigned int x;
+ u32 val32;
+ u16 val16;
+ u32 pix[2];
+
+ for (x = 0; x < pixels2; x += 2, ++dbuf32) {
+ pix[0] = le32_to_cpu(sbuf32[x]);
+ pix[1] = le32_to_cpu(sbuf32[x + 1]);
+ val32 = ((pix[0] & 0xf0000000) >> 16) |
+ ((pix[0] & 0x00f00000) >> 12) |
+ ((pix[0] & 0x0000f000) >> 8) |
+ ((pix[0] & 0x000000f0) >> 4) |
+ ((pix[1] & 0xf0000000) >> 0) |
+ ((pix[1] & 0x00f00000) << 4) |
+ ((pix[1] & 0x0000f000) << 8) |
+ ((pix[1] & 0x000000f0) << 12);
+ *dbuf32 = cpu_to_le32(val32);
+ }
This loop is an optimization. It converts two source pixels at a time
and writes them with a 32-bit store. When I measured the impact, I was
quite delighted by the results. I measured the time it takes to convert
a full cursor image of 64x64 pixels with drm_fb_argb8888_to_argb4444(),
and then looked at the average.
Without this loop in place, the average runtime stabilizes around 97K
nsecs. Here are the final results:
[ 406.420664] ast 0000:02:00.0: [drm] count=8448 average=97239 nsec
[ 414.869034] ast 0000:02:00.0: [drm] count=8704 average=97005 nsec
[ 425.665928] ast 0000:02:00.0: [drm] count=8960 average=97096 nsec
[ 435.185207] ast 0000:02:00.0: [drm] count=9216 average=96711 nsec
[ 442.244948] ast 0000:02:00.0: [drm] count=9472 average=96432 nsec
Count is the number of probes. The time has been taken with ktime_get_ns().
With the additional loop, the values stabilize around 52K nsecs.
[ 348.797840] ast 0000:02:00.0: [drm] count=8448 average=51729
[ 356.503387] ast 0000:02:00.0: [drm] count=8704 average=51680
[ 364.151804] ast 0000:02:00.0: [drm] count=8960 average=51574
[ 372.412221] ast 0000:02:00.0: [drm] count=9216 average=51563
[ 425.158072] ast 0000:02:00.0: [drm] count=9472 average=51674
That's only ~53% of the runtime of the unoptimized case.
Given these results, I'll try to add similar optimizations to other
format-conversion helpers. Most of the format conversion happens for
drivers with only a single output format, such as simpledrm. For
full-screen pageflips on such drivers, it might even make a visible
difference.
Best regards
Thomas
+ /*
+  * Convert the trailing pixel of an odd-width line. dbuf16 was set up to
+  * point at destination pixel index pixels2 already, so the store has to
+  * be indexed relative to pixels2; indexing with x directly would write
+  * at pixel 2 * pixels2, past the end of the line.
+  */
+ for (; x < pixels; x++) {
+ pix[0] = le32_to_cpu(sbuf32[x]);
+ val16 = ((pix[0] & 0xf0000000) >> 16) |
+ ((pix[0] & 0x00f00000) >> 12) |
+ ((pix[0] & 0x0000f000) >> 8) |
+ ((pix[0] & 0x000000f0) >> 4);
+ dbuf16[x - pixels2] = cpu_to_le16(val16);
+ }
+}
+
+/**
+ * drm_fb_argb8888_to_argb4444 - Convert ARGB8888 to ARGB4444 clip buffer
+ * @dst: Array of ARGB4444 destination buffers
+ * @dst_pitch: Array of numbers of bytes between the start of two consecutive scanlines
+ * within @dst; can be NULL if scanlines are stored next to each other.
+ * @src: Array of ARGB8888 source buffers
+ * @fb: DRM framebuffer
+ * @clip: Clip rectangle area to copy
+ * @state: Transform and conversion state
+ *
+ * This function copies parts of a framebuffer to display memory and converts
+ * the color format during the process. The parameters @dst, @dst_pitch and
+ * @src refer to arrays. Each array must have at least as many entries as
+ * there are planes in @fb's format. Each entry stores the value for the
+ * format's respective color plane at the same index.
+ *
+ * This function does not apply clipping on @dst (i.e. the destination is at the
+ * top-left corner).
+ *
+ * Each converted pixel occupies 2 bytes in the destination's first color
+ * plane.
+ *
+ * Drivers can use this function for ARGB4444 devices that don't support
+ * ARGB8888 natively.
+ */
+void drm_fb_argb8888_to_argb4444(struct iosys_map *dst, const unsigned int *dst_pitch,
+ const struct iosys_map *src, const struct drm_framebuffer *fb,
+ const struct drm_rect *clip, struct drm_format_conv_state *state)
+{
+ static const u8 dst_pixsize[DRM_FORMAT_MAX_PLANES] = {
+ 2,
+ };
+
+ drm_fb_xfrm(dst, dst_pitch, dst_pixsize, src, fb, clip, false, state,
+ drm_fb_argb8888_to_argb4444_line);
+}
+EXPORT_SYMBOL(drm_fb_argb8888_to_argb4444);
+
/**
* drm_fb_blit - Copy parts of a framebuffer to display memory
* @dst: Array of display-memory addresses to copy to
diff --git a/include/drm/drm_format_helper.h b/include/drm/drm_format_helper.h
index 428d81afe215..a1347e47e9d5 100644
--- a/include/drm/drm_format_helper.h
+++ b/include/drm/drm_format_helper.h
@@ -110,6 +110,9 @@ void drm_fb_xrgb8888_to_argb2101010(struct iosys_map *dst, const unsigned int *d
void drm_fb_xrgb8888_to_gray8(struct iosys_map *dst, const unsigned int *dst_pitch,
const struct iosys_map *src, const struct drm_framebuffer *fb,
const struct drm_rect *clip, struct drm_format_conv_state *state);
+void drm_fb_argb8888_to_argb4444(struct iosys_map *dst, const unsigned int *dst_pitch,
+ const struct iosys_map *src, const struct drm_framebuffer *fb,
+ const struct drm_rect *clip, struct drm_format_conv_state *state);
int drm_fb_blit(struct iosys_map *dst, const unsigned int *dst_pitch, uint32_t dst_format,
const struct iosys_map *src, const struct drm_framebuffer *fb,
--
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)