Re: [PATCH v5 18/44] drm/vkms: Use s32 for internal color pipeline precision

Louis Chauvet <louis.chauvet@xxxxxxxxxxx> · Tue, 27 Aug 2024 19:49:50 +0200

Le 19/08/24 - 16:56, Harry Wentland a écrit :
> Certain operations require us to preserve values below 0.0 and
> above 1.0 (0x0 and 0xffff respectively in 16 bpc unorm). One
> such operation is a BT709 encoding operation followed by its
> decoding operation, or the reverse.
> 
> We'll use s32 values as intermediate in and outputs of our
> color operations, for the operations where it matters.
>
> For now this won't apply to LUT operations. We might want to
> update those to work on s32 as well, but it's unclear how
> that should work for unorm LUT definitions. We'll revisit
> that once we add LUT + CTM tests.
> 
> In order to allow for this we'll also invert the nesting of our
> colorop processing loops. We now use the pixel iteration loop
> on the outside and the colorop iteration on the inside.

Maybe you can do this inversion on your first commit so it will reduce the 
code change here?

> v4:
>  - Clarify that we're packing 16-bit UNORM into s32, not
>    converting values to a different representation (Pekka)
> 
> v3:
>  - Use new colorop->next pointer
> 
> Signed-off-by: Harry Wentland <harry.wentland@xxxxxxx>
> ---
>  drivers/gpu/drm/vkms/vkms_composer.c | 55 ++++++++++++++++++++++------
>  drivers/gpu/drm/vkms/vkms_drv.h      |  4 ++
>  2 files changed, 47 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
> index bc116d16e378..6e939d3a6d5c 100644
> --- a/drivers/gpu/drm/vkms/vkms_composer.c
> +++ b/drivers/gpu/drm/vkms/vkms_composer.c
> @@ -164,7 +164,7 @@ static void apply_lut(const struct vkms_crtc_state *crtc_state, struct line_buff
>  	}
>  }
>  
> -static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colorop)
> +static void apply_colorop(struct pixel_argb_s32 *pixel, struct drm_colorop *colorop)
>  {
>  	struct drm_colorop_state *colorop_state = colorop->state;
>  
> @@ -190,24 +190,55 @@ static void apply_colorop(struct pixel_argb_u16 *pixel, struct drm_colorop *colo
>  
>  static void pre_blend_color_transform(const struct vkms_plane_state *plane_state, struct line_buffer *output_buffer)
>  {
> -	struct drm_colorop *colorop = plane_state->base.base.color_pipeline;
> +	struct drm_colorop *colorop;
> +	struct pixel_argb_s32 pixel;
>  
> -	while (colorop) {
> -		struct drm_colorop_state *colorop_state;
> +	for (size_t x = 0; x < output_buffer->n_pixels; x++) {
>  
> -		if (!colorop)
> -			return;
> +		/*
> +		 * Some operations, such as applying a BT709 encoding matrix,
> +		 * followed by a decoding matrix, require that we preserve
> +		 * values above 1.0 and below 0.0 until the end of the pipeline.
> +		 *
> +		 * Pack the 16-bit UNORM values into s32 to give us head-room to
> +		 * avoid clipping until we're at the end of the pipeline. Clip
> +		 * intentionally at the end of the pipeline before packing
> +		 * UNORM values back into u16.
> +		 */
> +		pixel.a = output_buffer->pixels[x].a;
> +		pixel.r = output_buffer->pixels[x].r;
> +		pixel.g = output_buffer->pixels[x].g;
> +		pixel.b = output_buffer->pixels[x].b;
>
> -		colorop_state = colorop->state;
> +		colorop = plane_state->base.base.color_pipeline;
> +		while (colorop) {
> +			struct drm_colorop_state *colorop_state;
>  
> -		if (!colorop_state)
> -			return;
> +			if (!colorop)
> +				return;

I think this is useless, the while should be sufficient.

> +
> +			colorop_state = colorop->state;
> +
> +			if (!colorop_state)
> +				return;
>  
> -		for (size_t x = 0; x < output_buffer->n_pixels; x++)
>  			if (!colorop_state->bypass)
> -				apply_colorop(&output_buffer->pixels[x], colorop);
> +				apply_colorop(&pixel, colorop);
>  
> -		colorop = colorop->next;
> +			colorop = colorop->next;
> +		}
> +
> +		/* clamp pixel */
> +		pixel.a = max(min(pixel.a, 0xffff), 0x0);
> +		pixel.r = max(min(pixel.r, 0xffff), 0x0);
> +		pixel.g = max(min(pixel.g, 0xffff), 0x0);
> +		pixel.b = max(min(pixel.b, 0xffff), 0x0);

clamp can't be used here? And can't we store the result directly in 
output_buffer?

> +
> +		/* put back to output_buffer */
> +		output_buffer->pixels[x].a = pixel.a;
> +		output_buffer->pixels[x].r = pixel.r;
> +		output_buffer->pixels[x].g = pixel.g;
> +		output_buffer->pixels[x].b = pixel.b;
>
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/vkms/vkms_drv.h b/drivers/gpu/drm/vkms/vkms_drv.h
> index 278cf3533f58..b78bc2611746 100644
> --- a/drivers/gpu/drm/vkms/vkms_drv.h
> +++ b/drivers/gpu/drm/vkms/vkms_drv.h
> @@ -36,6 +36,10 @@ struct vkms_frame_info {
>  	unsigned int cpp;
>  };
>  
> +struct pixel_argb_s32 {
> +	s32 a, r, g, b;
> +};
> +
>  struct pixel_argb_u16 {
>  	u16 a, r, g, b;
>  };
> -- 
> 2.46.0
>