Re: [PATCH 4/7] drm/meson: plane: add support for AFBC mode for OSD1 plane

Brian Starkey <Brian.Starkey@xxxxxxx> · Fri, 11 Oct 2019 10:56:00 +0000

Hi,

On Fri, Oct 11, 2019 at 11:14:43AM +0200, Neil Armstrong wrote:
> Hi Brian,
> 
> On 11/10/2019 10:41, Brian Starkey wrote:
> > Hi Neil,
> > 
> > On Thu, Oct 10, 2019 at 03:41:15PM +0200, Neil Armstrong wrote:
> >> Hi Ayan,
> >>
> >> On 10/10/2019 15:26, Ayan Halder wrote:
> >>> On Thu, Oct 10, 2019 at 11:25:23AM +0200, Neil Armstrong wrote:
> >>>> This adds all the OSD configuration plumbing to support the AFBC decoders
> >>>> path to display of the OSD1 plane.
> >>>>
> >>>> The Amlogic GXM and G12A AFBC decoders are integrated very differently.
> >>>>
> >>>> The Amlogic GXM has a direct output path to the OSD1 VIU pixel input,
> >>>> because the GXM AFBC decoder seems to be a custom IP developed by Amlogic.
> >>>>
> >>>> On the other side, the Amlogic G12A AFBC decoder seems to be an external
> >>>> IP that emits pixels on an AXI master hooked to a "Mali Unpack" block
> >>>> feeding the OSD1 VIU pixel input.
> >>>> This uses a weird "0x1000000" internal HW physical address on both
> >>>> sides to transfer the pixels.
> >>>>
> >>>> For Amlogic GXM, the supported pixel formats are the same as the normal
> >>>> linear OSD1 mode.
> >>>>
> >>>> On the other side, Amlogic added support for all AFBC v1.2 formats for
> >>>> the G12A AFBC integration.
> >>>>
> >>>> For simplicity, we stick to the already supported formats for now.
> >>>>
> >>>> Signed-off-by: Neil Armstrong <narmstrong@xxxxxxxxxxxx>
> >>>> ---
> >>>>  drivers/gpu/drm/meson/meson_crtc.c  |   2 +
> >>>>  drivers/gpu/drm/meson/meson_drv.h   |   4 +
> >>>>  drivers/gpu/drm/meson/meson_plane.c | 215 ++++++++++++++++++++++++----
> >>>>  3 files changed, 190 insertions(+), 31 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/meson/meson_crtc.c b/drivers/gpu/drm/meson/meson_crtc.c
> >>>> index 57ae1c13d1e6..d478fa232951 100644
> >>>> --- a/drivers/gpu/drm/meson/meson_crtc.c
> >>>> +++ b/drivers/gpu/drm/meson/meson_crtc.c
> >>>> @@ -281,6 +281,8 @@ void meson_crtc_irq(struct meson_drm *priv)
> >>>>  	if (priv->viu.osd1_enabled && priv->viu.osd1_commit) {
> >>>>  		writel_relaxed(priv->viu.osd1_ctrl_stat,
> >>>>  				priv->io_base + _REG(VIU_OSD1_CTRL_STAT));
> >>>> +		writel_relaxed(priv->viu.osd1_ctrl_stat2,
> >>>> +				priv->io_base + _REG(VIU_OSD1_CTRL_STAT2));
> >>>>  		writel_relaxed(priv->viu.osd1_blk0_cfg[0],
> >>>>  				priv->io_base + _REG(VIU_OSD1_BLK0_CFG_W0));
> >>>>  		writel_relaxed(priv->viu.osd1_blk0_cfg[1],
> >>>> diff --git a/drivers/gpu/drm/meson/meson_drv.h b/drivers/gpu/drm/meson/meson_drv.h
> >>>> index 60f13c6f34e5..de25349be8aa 100644
> >>>> --- a/drivers/gpu/drm/meson/meson_drv.h
> >>>> +++ b/drivers/gpu/drm/meson/meson_drv.h
> >>>> @@ -53,8 +53,12 @@ struct meson_drm {
> >>>>  		bool osd1_enabled;
> >>>>  		bool osd1_interlace;
> >>>>  		bool osd1_commit;
> >>>> +		bool osd1_afbcd;
> >>>>  		uint32_t osd1_ctrl_stat;
> >>>> +		uint32_t osd1_ctrl_stat2;
> >>>>  		uint32_t osd1_blk0_cfg[5];
> >>>> +		uint32_t osd1_blk1_cfg4;
> >>>> +		uint32_t osd1_blk2_cfg4;
> >>>>  		uint32_t osd1_addr;
> >>>>  		uint32_t osd1_stride;
> >>>>  		uint32_t osd1_height;
> >>>> diff --git a/drivers/gpu/drm/meson/meson_plane.c b/drivers/gpu/drm/meson/meson_plane.c
> >>>> index 5e798c276037..412941aa8402 100644
> >>>> --- a/drivers/gpu/drm/meson/meson_plane.c
> >>>> +++ b/drivers/gpu/drm/meson/meson_plane.c
> >>>> @@ -23,6 +23,7 @@
> >>>>  #include "meson_plane.h"
> >>>>  #include "meson_registers.h"
> >>>>  #include "meson_viu.h"
> >>>> +#include "meson_osd_afbcd.h"
> >>>>  
> >>>>  /* OSD_SCI_WH_M1 */
> >>>>  #define SCI_WH_M1_W(w)			FIELD_PREP(GENMASK(28, 16), w)
> >>>> @@ -92,12 +93,38 @@ static int meson_plane_atomic_check(struct drm_plane *plane,
> >>>>  						   false, true);
> >>>>  }
> >>>>  
> >>>> +#define MESON_MOD_AFBC_VALID_BITS (AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 |	\
> >>>> +				   AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 |	\
> >>>> +				   AFBC_FORMAT_MOD_YTR |		\
> >>>> +				   AFBC_FORMAT_MOD_SPARSE |		\
> >>>> +				   AFBC_FORMAT_MOD_SPLIT)
> >>>> +
> >>>>  /* Takes a fixed 16.16 number and converts it to integer. */
> >>>>  static inline int64_t fixed16_to_int(int64_t value)
> >>>>  {
> >>>>  	return value >> 16;
> >>>>  }
> >>>>  
> >>>> +static u32 meson_g12a_afbcd_line_stride(struct meson_drm *priv)
> >>>> +{
> >>>> +	u32 line_stride = 0;
> >>>> +
> >>>> +	switch (priv->afbcd.format) {
> >>>> +	case DRM_FORMAT_RGB565:
> >>>> +		line_stride = ((priv->viu.osd1_width << 4) + 127) >> 7;
> >>>> +		break;
> >>>> +	case DRM_FORMAT_RGB888:
> >>>> +	case DRM_FORMAT_XRGB8888:
> >>>> +	case DRM_FORMAT_ARGB8888:
> >>>> +	case DRM_FORMAT_XBGR8888:
> >>>> +	case DRM_FORMAT_ABGR8888:
> >>> Please have a look at
> >>> https://www.kernel.org/doc/html/latest/gpu/afbc.html for our
> >>> recommendation. We suggest that *X* formats are avoided.
> >>>
> >>> Also, for interoperability and maximum compression efficiency (with
> >>> AFBC_FORMAT_MOD_YTR), we suggest the following order :-
> >>>
> >>>         Component 0: R
> >>>         Component 1: G
> >>>         Component 2: B
> >>>         Component 3: A (if available)
> >>
> >>
> >> Sorry I don't understand, you ask me to limit AFBC to ABGR8888 ?
> >>
> >> But why, if the HW (GPU and DPU) is capable of it?
> > 
> > AFBC doesn't have an in-memory component order in the traditional
> > sense (i.e. a bit-position to component mapping), so Arm
> > have decided to define the convention that DRM_FORMAT_ABGR8888
> > represents the AFBC layout with R in component 0.
> 
> In this implementation, we handle the ARGB/ABGR as the same mode
> since the AFBC can only represent the layout as "ABGR" anyway.
> 

In this case, with the external AFBC IP, there's a whole extra layer
of potential confusion :-(

The decoder only needs to know the number of components - so
irrespective of what color channel is mapped to what component, it can
always be configured with the same mode for 4-component 32-bit
formats.

For everything to work correctly with YTR, the thing consuming the
output from the decoder must treat component 0 as 'R', but otherwise
it doesn't matter.

Is your HW able to treat the decoder output in different ways? e.g.
mapping component 0 to 'B'? If that's the case, then exposing the
different orders is valid - but only ABGR should allow YTR.

> > 
> > Are you sure the GPU supports other orders? I think any Arm driver
> > will only be producing DRM_FORMATs with "BGR" order e.g. ABGR8888.
> > I'm not convinced the GPU HW actually supports any other order, but
> > it's all rather confusing with texture swizzling. What I can tell you
> > for sure is that it _does_ support BGR order (in DRM naming
> > convention).
> 
> Well, since the Bifrost Mali blobs are closed-source and delivered
> by licensees, it's hard to define what is supported from a closed
> GPU HW, closed SW implementation to a closed pixel format implementation.
> 

I hear you. IMO the only way to make any of this clear is to publish
reference data and tests which make sure implementations match each
other. It's something I'm trying to make happen.

> You'll have to tell us if the closed libMali handling AFBC would accept
> ARGB8888 as format to render with AFBC enabled, if not you're right
> I'll discard XRGB8888/ARGB8888 for AFBC buffers completely.
> 
> But if the libMali chooses to generate an ARGB8888 buffer whatever
> ARGB8888/XRGB8888/ABGR8888/XBGR8888 is asked, then no I'll keep it that way.
> 

Yeah, I'll try and get clarity on this. It's not at all clear to me
either. When you say "accept ARGB8888 as format to render with AFBC
enabled", which API are you referring to, just so I can be clear? Do
you have an example of some code you're using to render AFBC with the
GPU blob?

In many APIs, there's no real expectation on in-memory component
order, so perhaps treating them all as the same is acceptable there.

However, fourcc + AFBC modifier is explicit in terms of component
order, and so IMO it's very harmful to "ignore" component order in
interfaces using fourcc + AFBC modifier.

There are implementations which support other orders, so ignoring
order will break those implementations. In some cases (Android, maybe
GL), this can be hidden behind "driver magic", but if the API is
fourcc + AFBC modifier, IMO it had better be completely explicit with
no tricks - irrespective of whatever other less-prescriptive APIs do.

> BTW I kept the vendor implementation here, which may be wrong but since
> they have the AFBC IP license and Mali Bifrost GPU license...
> 
> > 
> > If you do choose to expose orders other than BGR/ABGR, then you should
> > certainly not allow YTR to be used with any orders other than
> > BGR/ABGR. The AFBC spec defines YTR as using R in component 0, which
> > Arm has defined as DRM_FORMAT_*BGR* (component 0 in LE LSBs).
> > 
> 
> The MAFBC_FMT_RGBA8888 pixel format is defined in the AFBC decoder,
> which seems to be an ARM IP, the registers documentation is in the
> SoC datasheet at [1] and the formats bits are defined in the patch 3 at [2].
> 
> So it seems the decoder handles only a single type for 32bit RGB buffer
> format, as Amlogic names it MAFBC_FMT_RGBA8888
> 

Hopefully my comments at the beginning of this mail help clear this
part up a bit.

> For XRGB8888/XBGR8888 we simply "replace" the A component with a fixed
> value in the pixel generator.

That seems correct, so long as the decoder is configured in the
4-component mode.

> 
> [1] https://dl.khadas.com/Hardware/VIM3/Datasheet/S905D3_datasheet_0.2_Wesion.pdf page 772
> [2] https://patchwork.freedesktop.org/patch/335199/?series=67832&rev=1
> 
> >>
> >> Isn't it an userspace choice ? I understand XRGB8888 is a waste
> >> of memory space and compression efficiency, but this is not the
> >> kernel driver's to decide this, right ?
> >>
> > 
> > As long as it's agreed and understood what XRGB8888 means. It must be
> > an AFBC bitstream with 4-components, with B in component 0, G in
> > component 1, R in component 2 and 8 wasted bits in component 3.
> 
> Yes, but this is something userspace must assume, and it's already
> wasted in the linear XRGB8888 format anyway.
> 
> > 
> > I know of HW which treats "XBGR" with AFBC as a 3-component format,
> > which isn't correct but can easily lead to confusion and
> > incompatibility.
> 
> Seems it's not the case here, at least for the G12A SoC family.

That's good :-)

> 
> > 
> >> For interoperability I'll understand recommending a minimal set
> >> of modifiers and formats. But here, each platform is also limited
> >> by its GPU capabilities as well.
> >>
> > 
> > The (Arm) GPUs support ABGR ordering, so if everyone sticks to that we
> > can make sure everything's nice and compatible (until someone turns up
> > with HW which _doesn't_ support that ordering).
> 
> This is not clean enough in the https://www.kernel.org/doc/html/latest/gpu/afbc.html
> document. Since ARM is in control of the renderers, saying AFBC does _not_
> support another components format as ABGR ordering in all the
> OpenGL ES/Vulkan implementations, it would be clear we couldn't render
> anything using AFBC with ARGB.
> But we hit the closed-source/closed-specifications here again.
> 

I didn't really understand the middle sentence.

I know and understand that the "closed-ness" is a problem. The page
you linked was an initial attempt at making a clear, public
specification.

What I need to be clear about, though, is that it describes _only_
cases where DRM fourcc + AFBC modifier are used. I don't think there's
any sane way to apply it to other APIs, because the formats are
described differently, and the "leeway" allowed for doing things
"under-the-hood" is very different.

> > 
> >> Limiting to ABGR8888 would discard like every non-Android renderers,
> >> using AFBC, I'm not sure it's the kernels driver's responsibility.
> >>
> > 
> > It prevents renderers with hard-coded pixel formats, perhaps. But
> > those are already fragile by nature, surely?
> 
> Well, except Android, all the other renderers use ARGB8888/XRGB8888,
> as fixed pixel format, which is quite a large amount of code.
> 

I think whether that matters or not really depends on which graphics
APIs you're referring to. IMO it's inevitable that modifiers don't
simply "drop in" everywhere. The kernel API allows you to query what's
supported and pick that.

Thanks,
-Brian

> 
> Anyway, thanks for these technical clarifications, it makes things
> much clearer.
> 
> Neil
> 
> > 
> > Cheers,
> > -Brian
> > 
> >>>
> >>> Thus, DRM_FORMAT_ABGR, DRM_FORMAT_BGR should only be allowed.
> >>>> +		line_stride = ((priv->viu.osd1_width << 5) + 127) >> 7;
> >>>> +		break;
> >>>> +	}
> >>>> +
> >>>> +	return ((line_stride + 1) >> 1) << 1;
> >>>> +}
> >>>> +
> >>>>  static void meson_plane_atomic_update(struct drm_plane *plane,
> >>>>  				      struct drm_plane_state *old_state)
> >>>>  {
> >>
> >> [...]
> >>
> >>>>  
> >>>> +static bool meson_plane_format_mod_supported(struct drm_plane *plane,
> >>>> +					     u32 format, u64 modifier)
> >>>> +{
> >>>> +	struct meson_plane *meson_plane = to_meson_plane(plane);
> >>>> +	struct meson_drm *priv = meson_plane->priv;
> >>>> +	int i;
> >>>> +
> >>>> +	if (modifier == DRM_FORMAT_MOD_INVALID)
> >>>> +		return false;
> >>>> +
> >>>> +	if (modifier == DRM_FORMAT_MOD_LINEAR)
> >>>> +		return true;
> >>>> +
> >>>> +	if (!meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXM) &&
> >>>> +	    !meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A))
> >>>> +		return false;
> >>>> +
> >>>> +	if (modifier & ~DRM_FORMAT_MOD_ARM_AFBC(MESON_MOD_AFBC_VALID_BITS))
> >>>> +		return false;
> >>>> +
> >>>> +	for (i = 0 ; i < plane->modifier_count ; ++i)
> >>>> +		if (plane->modifiers[i] == modifier)
> >>>> +			break;
> >>>> +
> >>>> +	if (i == plane->modifier_count) {
> >>>> +		DRM_DEBUG_KMS("Unsupported modifier\n");
> >>>> +		return false;
> >>>> +	}
> >>
> >> I can add a warn_once here, would it be enough ?
> >>
> >>>> +
> >>>> +	if (priv->afbcd.ops && priv->afbcd.ops->supported_fmt)
> >>>> +		return priv->afbcd.ops->supported_fmt(modifier, format);
> >>>> +
> >>>> +	DRM_DEBUG_KMS("AFBC Unsupported\n");
> >>>> +	return false;
> >>>> +}
> >>>> +
> >>>>  static const struct drm_plane_funcs meson_plane_funcs = {
> >>>>  	.update_plane		= drm_atomic_helper_update_plane,
> >>>>  	.disable_plane		= drm_atomic_helper_disable_plane,
> >>>> @@ -353,6 +457,7 @@ static const struct drm_plane_funcs meson_plane_funcs = {
> >>>>  	.reset			= drm_atomic_helper_plane_reset,
> >>>>  	.atomic_duplicate_state = drm_atomic_helper_plane_duplicate_state,
> >>>>  	.atomic_destroy_state	= drm_atomic_helper_plane_destroy_state,
> >>>> +	.format_mod_supported   = meson_plane_format_mod_supported,
> >>>>  };
> >>>>  
> >>>>  static const uint32_t supported_drm_formats[] = {
> >>>> @@ -364,10 +469,53 @@ static const uint32_t supported_drm_formats[] = {
> >>>>  	DRM_FORMAT_RGB565,
> >>>>  };
> >>>>  
> >>>> +static const uint64_t format_modifiers_afbc_gxm[] = {
> >>>> +	DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 |
> >>>> +				AFBC_FORMAT_MOD_SPARSE |
> >>>> +				AFBC_FORMAT_MOD_YTR),
> >>>> +	/* SPLIT mandates SPARSE, RGB modes mandates YTR */
> >>>> +	DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 |
> >>>> +				AFBC_FORMAT_MOD_YTR |
> >>>> +				AFBC_FORMAT_MOD_SPARSE |
> >>>> +				AFBC_FORMAT_MOD_SPLIT),
> >>>> +	DRM_FORMAT_MOD_LINEAR,
> >>>> +	DRM_FORMAT_MOD_INVALID,
> >>>> +};
> >>>> +
> >>>> +static const uint64_t format_modifiers_afbc_g12a[] = {
> >>>> +	/*
> >>>> +	 * - TOFIX Support AFBC modifiers for YUV formats (16x16 + TILED)
> >>>> +	 * - AFBC_FORMAT_MOD_YTR is mandatory since we only support RGB
> >>>> +	 * - SPLIT is mandatory for performances reasons when in 16x16
> >>>> +	 *   block size
> >>>> +	 * - 32x8 block size + SPLIT is mandatory with 4K frame size
> >>>> +	 *   for performances reasons
> >>>> +	 */
> >>>> +	DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_16x16 |
> >>>> +				AFBC_FORMAT_MOD_YTR |
> >>>> +				AFBC_FORMAT_MOD_SPARSE |
> >>>> +				AFBC_FORMAT_MOD_SPLIT),
> >>>> +	DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 |
> >>>> +				AFBC_FORMAT_MOD_YTR |
> >>>> +				AFBC_FORMAT_MOD_SPARSE),
> >>>> +	DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 |
> >>>> +				AFBC_FORMAT_MOD_YTR |
> >>>> +				AFBC_FORMAT_MOD_SPARSE |
> >>>> +				AFBC_FORMAT_MOD_SPLIT),
> >>>> +	DRM_FORMAT_MOD_LINEAR,
> >>>> +	DRM_FORMAT_MOD_INVALID,
> >>>> +};
> >>>> +
> >>>> +static const uint64_t format_modifiers_default[] = {
> >>>> +	DRM_FORMAT_MOD_LINEAR,
> >>>> +	DRM_FORMAT_MOD_INVALID,
> >>>> +};
> >>>> +
> >>>>  int meson_plane_create(struct meson_drm *priv)
> >>>>  {
> >>>>  	struct meson_plane *meson_plane;
> >>>>  	struct drm_plane *plane;
> >>>> +	const uint64_t *format_modifiers = format_modifiers_default;
> >>>>  
> >>>>  	meson_plane = devm_kzalloc(priv->drm->dev, sizeof(*meson_plane),
> >>>>  				   GFP_KERNEL);
> >>>> @@ -377,11 +525,16 @@ int meson_plane_create(struct meson_drm *priv)
> >>>>  	meson_plane->priv = priv;
> >>>>  	plane = &meson_plane->base;
> >>>>  
> >>>> +	if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXM))
> >>>> +		format_modifiers = format_modifiers_afbc_gxm;
> >>>> +	else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A))
> >>>> +		format_modifiers = format_modifiers_afbc_g12a;
> >>>> +
> >>>>  	drm_universal_plane_init(priv->drm, plane, 0xFF,
> >>>>  				 &meson_plane_funcs,
> >>>>  				 supported_drm_formats,
> >>>>  				 ARRAY_SIZE(supported_drm_formats),
> >>>> -				 NULL,
> >>>> +				 format_modifiers,
> >>>>  				 DRM_PLANE_TYPE_PRIMARY, "meson_primary_plane");
> >>>>  
> >>>>  	drm_plane_helper_add(plane, &meson_plane_helper_funcs);
> >>>> -- 
> >>>> 2.22.0
> >>
> >> _______________________________________________
> >> dri-devel mailing list
> >> dri-devel@xxxxxxxxxxxxxxxxxxxxx
> >> https://lists.freedesktop.org/mailman/listinfo/dri-devel
> 
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel