Re: [PATCH 01/14] staging: vc04_services: Add new vc-sm-cma driver

Laurent Pinchart <laurent.pinchart@xxxxxxxxxxxxxxxx> · Tue, 22 Nov 2022 01:04:40 +0200

Hi Umang (and Dave),

Thank you for the patch.

On Tue, Nov 22, 2022 at 03:17:09AM +0530, Umang Jain wrote:
> From: Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
> 
> Add Broadcom VideoCore Shared Memory support.
> 
> This new driver allows contiguous memory blocks to be imported
> into the VideoCore VPU memory map, and manages the lifetime of
> those objects, only releasing the source dmabuf once the VPU has
> confirmed it has finished with it.
> 
> Signed-off-by: Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
> Signed-off-by: Umang Jain <umang.jain@xxxxxxxxxxxxxxxx>
> ---
>  drivers/staging/vc04_services/Kconfig         |   2 +
>  drivers/staging/vc04_services/Makefile        |   1 +
>  .../staging/vc04_services/vc-sm-cma/Kconfig   |  10 +
>  .../staging/vc04_services/vc-sm-cma/Makefile  |  12 +
>  .../staging/vc04_services/vc-sm-cma/vc_sm.c   | 801 ++++++++++++++++++
>  .../staging/vc04_services/vc-sm-cma/vc_sm.h   |  54 ++
>  .../vc04_services/vc-sm-cma/vc_sm_cma_vchi.c  | 507 +++++++++++
>  .../vc04_services/vc-sm-cma/vc_sm_cma_vchi.h  |  63 ++
>  .../vc04_services/vc-sm-cma/vc_sm_defs.h      | 187 ++++
>  .../vc04_services/vc-sm-cma/vc_sm_knl.h       |  28 +
>  10 files changed, 1665 insertions(+)
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/Kconfig
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/Makefile
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
>  create mode 100644 drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
> 
> diff --git a/drivers/staging/vc04_services/Kconfig b/drivers/staging/vc04_services/Kconfig
> index 31e58c9d1a11..6c0e77d64376 100644
> --- a/drivers/staging/vc04_services/Kconfig
> +++ b/drivers/staging/vc04_services/Kconfig
> @@ -46,5 +46,7 @@ source "drivers/staging/vc04_services/bcm2835-camera/Kconfig"
>  
>  source "drivers/staging/vc04_services/vchiq-mmal/Kconfig"
>  
> +source "drivers/staging/vc04_services/vc-sm-cma/Kconfig"
> +
>  endif
>  
> diff --git a/drivers/staging/vc04_services/Makefile b/drivers/staging/vc04_services/Makefile
> index 1fd191e2e2a5..01089f369fb4 100644
> --- a/drivers/staging/vc04_services/Makefile
> +++ b/drivers/staging/vc04_services/Makefile
> @@ -14,6 +14,7 @@ endif
>  obj-$(CONFIG_SND_BCM2835)		+= bcm2835-audio/
>  obj-$(CONFIG_VIDEO_BCM2835)		+= bcm2835-camera/
>  obj-$(CONFIG_BCM2835_VCHIQ_MMAL)	+= vchiq-mmal/
> +obj-$(CONFIG_BCM_VC_SM_CMA)		+= vc-sm-cma/
>  
>  ccflags-y += -I $(srctree)/$(src)/include
>  
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/Kconfig b/drivers/staging/vc04_services/vc-sm-cma/Kconfig
> new file mode 100644
> index 000000000000..bbd296f5826b
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/Kconfig
> @@ -0,0 +1,10 @@
> +config BCM_VC_SM_CMA
> +	tristate "VideoCore Shared Memory (CMA) driver"
> +	depends on BCM2835_VCHIQ
> +	select RBTREE

I don't see an RBTREE config option in mainline.

> +	select DMA_SHARED_BUFFER
> +	help
> +	  Say Y here to enable the shared memory interface that
> +	  supports sharing dmabufs with VideoCore.
> +	  This operates over the VCHIQ interface to a service
> +	  running on VideoCore.
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/Makefile b/drivers/staging/vc04_services/vc-sm-cma/Makefile
> new file mode 100644
> index 000000000000..c92a5775c62e
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/Makefile
> @@ -0,0 +1,12 @@
> +ccflags-y += \
> +	-I$(srctree)/$(src)/../ \
> +	-I$(srctree)/$(src)/../interface/vchiq_arm\

Missing space before \

> +	-I$(srctree)/$(src)/../include
> +
> +ccflags-y += \
> +	-D__VCCOREVER__=0

Is this needed ?

> +
> +vc-sm-cma-$(CONFIG_BCM_VC_SM_CMA) := \
> +	vc_sm.o vc_sm_cma_vchi.o
> +
> +obj-$(CONFIG_BCM_VC_SM_CMA) += vc-sm-cma.o
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
> new file mode 100644
> index 000000000000..7fe81d259c7b
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.c
> @@ -0,0 +1,801 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * VideoCore Shared Memory driver using CMA.
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Dave Stevenson <dave.stevenson@xxxxxxxxxxxxxxx>
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation for some API,
> + * and taking some code for buffer allocation and dmabuf handling from
> + * videobuf2.
> + *
> + * This driver handles taking a dmabuf, mapping it into the VPU address space,
> + * and returning a handle that the VPU can use.
> + */
> +
> +/* ---- Include Files ----------------------------------------------------- */

I'd drop this comment (same below), it feels a bit alien.

> +#include <linux/debugfs.h>
> +#include <linux/dma-buf.h>
> +#include <linux/errno.h>
> +#include <linux/miscdevice.h>
> +#include <linux/platform_device.h>
> +#include <linux/syscalls.h>
> +
> +#include "vchiq_connected.h"
> +#include "vc_sm_cma_vchi.h"
> +
> +#include "vc_sm.h"
> +#include "vc_sm_knl.h"
> +
> +MODULE_IMPORT_NS(DMA_BUF);
> +
> +/* ---- Private Constants and Types --------------------------------------- */
> +
> +#define VC_SM_RESOURCE_NAME_DEFAULT       "sm-host-resource"
> +
> +#define VC_SM_DIR_ROOT_NAME	"vcsm-cma"
> +#define VC_SM_STATE		"state"
> +
> +/* Private file data associated with each opened device. */
> +struct vc_sm_privdata_t {
> +};
> +
> +typedef int (*VC_SM_SHOW) (struct seq_file *s, void *v);
> +struct sm_pde_t {

Should this structure (and the other ones below) have a vc_ prefix for
consistency ? Also, drop the _t suffix.

> +	VC_SM_SHOW show;          /* Debug fs function hookup. */
> +	struct dentry *dir_entry; /* Debug fs directory entry. */
> +	void *priv_data;          /* Private data */

Never used.

> +};
> +
> +/* Global state information. */
> +struct sm_state_t {
> +	struct platform_device *pdev;
> +
> +	struct miscdevice misc_dev;
> +
> +	struct sm_instance *sm_handle;	/* Handle for videocore service. */
> +
> +	spinlock_t kernelid_map_lock;	/* Spinlock protecting kernelid_map */
> +	struct idr kernelid_map;
> +
> +	struct mutex map_lock;          /* Global map lock. */
> +	struct list_head buffer_list;	/* List of buffer. */
> +
> +	struct dentry *dir_root;	/* Debug fs entries root. */
> +	struct sm_pde_t dir_state;	/* Debug fs entries state sub-tree. */
> +
> +	bool require_released_callback;	/* VPU will send a released msg when it
> +					 * has finished with a resource.
> +					 */
> +	/* State for transactions */
> +	int restart_sys;                /* Tracks restart on interrupt. */
> +	enum vc_sm_msg_type int_action; /* Interrupted action. */

Both of these are set but never used. Either there's dead code below, or
something is wrong.

> +
> +	u32 int_trans_id;		/* Interrupted transaction. */
> +	struct vchiq_instance *vchiq_instance;
> +};
> +
> +struct vc_sm_dma_buf_attachment {

Never used.

> +	struct device *dev;
> +	struct sg_table sg_table;
> +	struct list_head list;
> +	enum dma_data_direction	dma_dir;
> +};
> +
> +/* ---- Private Variables ----------------------------------------------- */
> +
> +static struct sm_state_t *sm_state;
> +static int sm_inited;

Can we avoid global variables please ?

> +
> +/* ---- Private Function Prototypes -------------------------------------- */
> +
> +/* ---- Private Functions ------------------------------------------------ */

If you want to split the driver in sections (which I personally like for
large files), I would group functions by purpose and add a section
header for each group.

> +
> +static int get_kernel_id(struct vc_sm_buffer *buffer)
> +{
> +	int handle;
> +
> +	spin_lock(&sm_state->kernelid_map_lock);
> +	handle = idr_alloc(&sm_state->kernelid_map, buffer, 0, 0, GFP_KERNEL);
> +	spin_unlock(&sm_state->kernelid_map_lock);
> +
> +	return handle;
> +}
> +
> +static struct vc_sm_buffer *lookup_kernel_id(int handle)
> +{
> +	return idr_find(&sm_state->kernelid_map, handle);
> +}
> +
> +static void free_kernel_id(int handle)
> +{
> +	spin_lock(&sm_state->kernelid_map_lock);
> +	idr_remove(&sm_state->kernelid_map, handle);
> +	spin_unlock(&sm_state->kernelid_map_lock);
> +}

For instance those three functions could be in the IDR section. I would
probably inline them in their respective caller though, but if you wait
to keep them, you should give them better names. vc_sm_cma_idr_ sounds
like a good prefix, the functions could be vc_sm_cma_idr_alloc(),
vc_sm_cma_idr_find() and vc_sm_cma_idr_remove(). I would then move the
IDR init and cleanup code to two separate functions too.

> +
> +static int vc_sm_cma_seq_file_show(struct seq_file *s, void *v)
> +{
> +	struct sm_pde_t *sm_pde;
> +
> +	sm_pde = (struct sm_pde_t *)(s->private);
> +
> +	if (sm_pde && sm_pde->show)
> +		sm_pde->show(s, v);

The only show handler is vc_sm_cma_global_state_show(), you can simplify
all this by dropping this function and passing
vc_sm_cma_global_state_show to single_open(). The show field in sm_pde_t
can also be dropped. You can actually drop the sm_pde_t structure and
embed the dir_entry field directly in sm_state_t.

> +
> +	return 0;
> +}
> +
> +static int vc_sm_cma_single_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, vc_sm_cma_seq_file_show, inode->i_private);
> +}
> +
> +static const struct file_operations vc_sm_cma_debug_fs_fops = {
> +	.open = vc_sm_cma_single_open,
> +	.read = seq_read,
> +	.llseek = seq_lseek,
> +	.release = single_release,
> +};
> +
> +static int vc_sm_cma_global_state_show(struct seq_file *s, void *v)
> +{
> +	struct vc_sm_buffer *resource = NULL;
> +	int resource_count = 0;
> +
> +	if (!sm_state)
> +		return 0;
> +
> +	seq_printf(s, "\nVC-ServiceHandle     %p\n", sm_state->sm_handle);
> +
> +	/* Log all applicable mapping(s). */
> +
> +	mutex_lock(&sm_state->map_lock);
> +	seq_puts(s, "\nResources\n");
> +	if (!list_empty(&sm_state->buffer_list)) {
> +		list_for_each_entry(resource, &sm_state->buffer_list,
> +				    global_buffer_list) {
> +			resource_count++;
> +
> +			seq_printf(s, "\nResource                %p\n",
> +				   resource);
> +			seq_printf(s, "           NAME         %s\n",
> +				   resource->name);
> +			seq_printf(s, "           SIZE         %zu\n",
> +				   resource->size);
> +			seq_printf(s, "           DMABUF       %p\n",
> +				   resource->dma_buf);
> +			seq_printf(s, "           IMPORTED_DMABUF %p\n",
> +				   resource->imported_dma_buf);
> +			seq_printf(s, "           ATTACH       %p\n",
> +				   resource->attach);
> +			seq_printf(s, "           SGT          %p\n",
> +				   resource->sgt);
> +			seq_printf(s, "           DMA_ADDR     %pad\n",
> +				   &resource->dma_addr);
> +			seq_printf(s, "           VC_HANDLE     %08x\n",
> +				   resource->vc_handle);
> +			seq_printf(s, "           VC_MAPPING    %d\n",
> +				   resource->vpu_state);
> +		}
> +	}
> +	seq_printf(s, "\n\nTotal resource count:   %d\n\n", resource_count);
> +
> +	mutex_unlock(&sm_state->map_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * Adds a buffer to the private data list which tracks all the allocated
> + * data.
> + */
> +static void vc_sm_add_resource(struct vc_sm_buffer *buffer)
> +{
> +	mutex_lock(&sm_state->map_lock);
> +	list_add(&buffer->global_buffer_list, &sm_state->buffer_list);
> +	mutex_unlock(&sm_state->map_lock);
> +
> +	pr_debug("[%s]: added buffer %p (name %s, size %zu)\n",
> +		 __func__, buffer, buffer->name, buffer->size);

Let's use dev_* instead of pr_* where possible.

> +}
> +
> +/*
> + * Cleans up imported dmabuf.
> + * Should be called with mutex held.
> + */
> +static void vc_sm_clean_up_dmabuf(struct vc_sm_buffer *buffer)
> +{
> +	/* Handle cleaning up imported dmabufs */
> +	if (buffer->sgt) {
> +		dma_buf_unmap_attachment(buffer->attach,
> +					 buffer->sgt,
> +					 DMA_BIDIRECTIONAL);
> +		buffer->sgt = NULL;
> +	}
> +	if (buffer->attach) {
> +		dma_buf_detach(buffer->dma_buf, buffer->attach);
> +		buffer->attach = NULL;
> +	}
> +}
> +
> +/*
> + * Instructs VPU to decrement the refcount on a buffer.
> + */
> +static void vc_sm_vpu_free(struct vc_sm_buffer *buffer)
> +{
> +	if (buffer->vc_handle && buffer->vpu_state == VPU_MAPPED) {
> +		struct vc_sm_free_t free = { buffer->vc_handle, 0 };
> +		int status = vc_sm_cma_vchi_free(sm_state->sm_handle, &free,
> +					     &sm_state->int_trans_id);
> +		if (status != 0 && status != -EINTR) {
> +			pr_err("[%s]: failed to free memory on videocore (status: %u, trans_id: %u)\n",
> +			       __func__, status, sm_state->int_trans_id);
> +		}
> +
> +		if (sm_state->require_released_callback) {
> +			/* Need to wait for the VPU to confirm the free. */
> +
> +			/* Retain a reference on this until the VPU has
> +			 * released it
> +			 */
> +			buffer->vpu_state = VPU_UNMAPPING;
> +		} else {
> +			buffer->vpu_state = VPU_NOT_MAPPED;
> +			buffer->vc_handle = 0;
> +		}
> +	}
> +}
> +
> +/*
> + * Release an allocation.
> + * All refcounting is done via the dma buf object.
> + *
> + * Must be called with the mutex held. The function will either release the
> + * mutex (if defering the release) or destroy it. The caller must therefore not
> + * reuse the buffer on return.
> + */
> +static void vc_sm_release_resource(struct vc_sm_buffer *buffer)
> +{
> +	pr_debug("[%s]: buffer %p (name %s, size %zu)\n",
> +		 __func__, buffer, buffer->name, buffer->size);
> +
> +	if (buffer->vc_handle) {
> +		/* We've sent the unmap request but not had the response. */
> +		pr_debug("[%s]: Waiting for VPU unmap response on %p\n",
> +			 __func__, buffer);
> +		goto defer;
> +	}
> +	if (buffer->in_use) {
> +		/* dmabuf still in use - we await the release */
> +		pr_debug("[%s]: buffer %p is still in use\n", __func__, buffer);
> +		goto defer;
> +	}
> +
> +	/* Release the allocation */
> +	if (buffer->imported_dma_buf)
> +		dma_buf_put(buffer->imported_dma_buf);
> +	else
> +		pr_err("%s: Imported dmabuf already been put for buf %p\n",
> +		       __func__, buffer);
> +	buffer->imported_dma_buf = NULL;
> +
> +	/* Free our buffer. Start by removing it from the list */
> +	mutex_lock(&sm_state->map_lock);
> +	list_del(&buffer->global_buffer_list);
> +	mutex_unlock(&sm_state->map_lock);
> +
> +	pr_debug("%s: Release our allocation - done\n", __func__);
> +	mutex_unlock(&buffer->lock);
> +
> +	mutex_destroy(&buffer->lock);
> +
> +	kfree(buffer);
> +	return;
> +
> +defer:
> +	mutex_unlock(&buffer->lock);
> +}
> +
> +/* Dma_buf operations for chaining through to an imported dma_buf */
> +
> +static void vc_sm_dma_buf_release(struct dma_buf *dmabuf)
> +{
> +	struct vc_sm_buffer *buffer;
> +
> +	if (!dmabuf)
> +		return;
> +
> +	buffer = (struct vc_sm_buffer *)dmabuf->priv;
> +
> +	mutex_lock(&buffer->lock);
> +
> +	pr_debug("%s dmabuf %p, buffer %p\n", __func__, dmabuf, buffer);
> +
> +	buffer->in_use = 0;
> +
> +	/* Unmap on the VPU */
> +	vc_sm_vpu_free(buffer);
> +	pr_debug("%s vpu_free done\n", __func__);
> +
> +	/* Unmap our dma_buf object (the vc_sm_buffer remains until released
> +	 * on the VPU).
> +	 */
> +	vc_sm_clean_up_dmabuf(buffer);
> +	pr_debug("%s clean_up dmabuf done\n", __func__);
> +
> +	/* buffer->lock will be destroyed by vc_sm_release_resource if finished
> +	 * with, otherwise unlocked. Do NOT unlock here.
> +	 */
> +	vc_sm_release_resource(buffer);
> +	pr_debug("%s done\n", __func__);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_attach(struct dma_buf *dmabuf,
> +				struct dma_buf_attachment *attachment)
> +{
> +	struct vc_sm_buffer *buf = dmabuf->priv;
> +
> +	return buf->imported_dma_buf->ops->attach(buf->imported_dma_buf,
> +						attachment);
> +}
> +
> +static
> +void vc_sm_import_dma_buf_detatch(struct dma_buf *dmabuf,
> +				  struct dma_buf_attachment *attachment)
> +{
> +	struct vc_sm_buffer *buf = dmabuf->priv;
> +
> +	buf->imported_dma_buf->ops->detach(buf->imported_dma_buf, attachment);
> +}
> +
> +static
> +struct sg_table *vc_sm_import_map_dma_buf(struct dma_buf_attachment *attachment,
> +					  enum dma_data_direction direction)
> +{
> +	struct vc_sm_buffer *buf = attachment->dmabuf->priv;
> +
> +	return buf->imported_dma_buf->ops->map_dma_buf(attachment,
> +						     direction);
> +}
> +
> +static
> +void vc_sm_import_unmap_dma_buf(struct dma_buf_attachment *attachment,
> +				struct sg_table *table,
> +				enum dma_data_direction direction)
> +{
> +	struct vc_sm_buffer *buf = attachment->dmabuf->priv;
> +
> +	buf->imported_dma_buf->ops->unmap_dma_buf(attachment, table, direction);
> +}
> +
> +static
> +int vc_sm_import_dmabuf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> +{
> +	struct vc_sm_buffer *buf = dmabuf->priv;
> +
> +	pr_debug("%s: mmap dma_buf %p, buf %p, imported db %p\n", __func__,
> +		 dmabuf, buf, buf->imported_dma_buf);
> +
> +	return buf->imported_dma_buf->ops->mmap(buf->imported_dma_buf, vma);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
> +					  enum dma_data_direction direction)
> +{
> +	struct vc_sm_buffer *buf = dmabuf->priv;
> +
> +	return buf->imported_dma_buf->ops->begin_cpu_access(buf->imported_dma_buf,
> +							  direction);
> +}
> +
> +static
> +int vc_sm_import_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
> +					enum dma_data_direction direction)
> +{
> +	struct vc_sm_buffer *buf = dmabuf->priv;
> +
> +	return buf->imported_dma_buf->ops->end_cpu_access(buf->imported_dma_buf,
> +							  direction);
> +}
> +
> +static const struct dma_buf_ops dma_buf_import_ops = {
> +	.map_dma_buf = vc_sm_import_map_dma_buf,
> +	.unmap_dma_buf = vc_sm_import_unmap_dma_buf,
> +	.mmap = vc_sm_import_dmabuf_mmap,
> +	.release = vc_sm_dma_buf_release,
> +	.attach = vc_sm_import_dma_buf_attach,
> +	.detach = vc_sm_import_dma_buf_detatch,
> +	.begin_cpu_access = vc_sm_import_dma_buf_begin_cpu_access,
> +	.end_cpu_access = vc_sm_import_dma_buf_end_cpu_access,
> +};
> +
> +/* Import a dma_buf to be shared with VC. */
> +int
> +vc_sm_cma_import_dmabuf_internal(struct sm_state_t *state,
> +				 struct dma_buf *dma_buf,
> +				 int fd,
> +				 struct dma_buf **imported_buf)

This can be static.

> +{
> +	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +	struct vc_sm_buffer *buffer = NULL;
> +	struct vc_sm_import import = { };
> +	struct vc_sm_import_result result = { };
> +	struct dma_buf_attachment *attach = NULL;
> +	struct sg_table *sgt = NULL;
> +	dma_addr_t dma_addr;
> +	int ret = 0;
> +	int status;
> +
> +	/* Setup our allocation parameters */
> +	pr_debug("%s: importing dma_buf %p/fd %d\n", __func__, dma_buf, fd);
> +
> +	if (fd < 0)
> +		get_dma_buf(dma_buf);
> +	else
> +		dma_buf = dma_buf_get(fd);

This function is always called with fd == -1. There seems to be quite a
bit of dead code in this driver, could you double-check everything and
trim it down ?

> +
> +	if (!dma_buf)
> +		return -EINVAL;
> +
> +	attach = dma_buf_attach(dma_buf, &sm_state->pdev->dev);
> +	if (IS_ERR(attach)) {
> +		ret = PTR_ERR(attach);
> +		goto error;
> +	}
> +
> +	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
> +	if (IS_ERR(sgt)) {
> +		ret = PTR_ERR(sgt);
> +		goto error;
> +	}
> +
> +	/* Verify that the address block is contiguous */
> +	if (sgt->nents != 1) {
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	/* Allocate local buffer to track this allocation. */
> +	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
> +	if (!buffer) {
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	import.type = VC_SM_ALLOC_NON_CACHED;
> +	dma_addr = sg_dma_address(sgt->sgl);
> +	import.addr = (u32)dma_addr;
> +	if ((import.addr & 0xC0000000) != 0xC0000000) {
> +		pr_err("%s: Expecting an uncached alias for dma_addr %pad\n",
> +		       __func__, &dma_addr);
> +		import.addr |= 0xC0000000;
> +	}
> +	import.size = sg_dma_len(sgt->sgl);
> +	import.allocator = current->tgid;
> +	import.kernel_id = get_kernel_id(buffer);
> +
> +	memcpy(import.name, VC_SM_RESOURCE_NAME_DEFAULT,
> +	       sizeof(VC_SM_RESOURCE_NAME_DEFAULT));
> +
> +	pr_debug("[%s]: attempt to import \"%s\" data - type %u, addr %pad, size %u.\n",
> +		 __func__, import.name, import.type, &dma_addr, import.size);
> +
> +	/* Allocate the videocore buffer. */
> +	status = vc_sm_cma_vchi_import(sm_state->sm_handle, &import, &result,
> +				       &sm_state->int_trans_id);
> +	if (status == -EINTR) {
> +		pr_debug("[%s]: requesting import memory action restart (trans_id: %u)\n",
> +			 __func__, sm_state->int_trans_id);
> +		ret = -ERESTARTSYS;
> +		sm_state->restart_sys = -EINTR;
> +		sm_state->int_action = VC_SM_MSG_TYPE_IMPORT;
> +		goto error;
> +	} else if (status || !result.res_handle) {
> +		pr_debug("[%s]: failed to import memory on videocore (status: %u, trans_id: %u)\n",
> +			 __func__, status, sm_state->int_trans_id);
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	mutex_init(&buffer->lock);
> +	INIT_LIST_HEAD(&buffer->attachments);
> +	memcpy(buffer->name, import.name,
> +	       min(sizeof(buffer->name), sizeof(import.name) - 1));
> +
> +	/* Keep track of the buffer we created. */
> +	buffer->vc_handle = result.res_handle;
> +	buffer->size = import.size;
> +	buffer->vpu_state = VPU_MAPPED;
> +
> +	buffer->imported_dma_buf = dma_buf;
> +
> +	buffer->attach = attach;
> +	buffer->sgt = sgt;
> +	buffer->dma_addr = dma_addr;
> +	buffer->in_use = 1;
> +	buffer->kernel_id = import.kernel_id;
> +
> +	/*
> +	 * We're done - we need to export a new dmabuf chaining through most
> +	 * functions, but enabling us to release our own internal references
> +	 * here.
> +	 */

Why do we need to export a new dmabuf ?

> +	exp_info.ops = &dma_buf_import_ops;
> +	exp_info.size = import.size;
> +	exp_info.flags = O_RDWR;
> +	exp_info.priv = buffer;
> +
> +	buffer->dma_buf = dma_buf_export(&exp_info);
> +	if (IS_ERR(buffer->dma_buf)) {
> +		ret = PTR_ERR(buffer->dma_buf);
> +		goto error;
> +	}
> +
> +	vc_sm_add_resource(buffer);
> +
> +	*imported_buf = buffer->dma_buf;
> +
> +	return 0;
> +
> +error:
> +	if (result.res_handle) {
> +		struct vc_sm_free_t free = { result.res_handle, 0 };
> +
> +		vc_sm_cma_vchi_free(sm_state->sm_handle, &free,
> +				    &sm_state->int_trans_id);
> +	}
> +	free_kernel_id(import.kernel_id);
> +	kfree(buffer);
> +	if (sgt)
> +		dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
> +	if (attach)
> +		dma_buf_detach(dma_buf, attach);
> +	dma_buf_put(dma_buf);
> +	return ret;
> +}
> +
> +static void
> +vc_sm_vpu_event(struct sm_instance *instance, struct vc_sm_result_t *reply,
> +		int reply_len)
> +{
> +	switch (reply->trans_id & ~0x80000000) {
> +	case VC_SM_MSG_TYPE_CLIENT_VERSION:
> +	{
> +		/* Acknowledge that the firmware supports the version command */
> +		pr_debug("%s: firmware acked version msg. Require release cb\n",
> +			 __func__);
> +		sm_state->require_released_callback = true;
> +	}
> +	break;

No need for braces.

> +	case VC_SM_MSG_TYPE_RELEASED:
> +	{
> +		struct vc_sm_released *release = (struct vc_sm_released *)reply;
> +		struct vc_sm_buffer *buffer =
> +					lookup_kernel_id(release->kernel_id);
> +		if (!buffer) {
> +			pr_err("%s: VC released a buffer that is already released, kernel_id %d\n",
> +			       __func__, release->kernel_id);
> +			break;
> +		}
> +		mutex_lock(&buffer->lock);
> +
> +		pr_debug("%s: Released addr %08x, size %u, id %08x, mem_handle %08x\n",
> +			 __func__, release->addr, release->size,
> +			 release->kernel_id, release->vc_handle);
> +
> +		buffer->vc_handle = 0;
> +		buffer->vpu_state = VPU_NOT_MAPPED;
> +		free_kernel_id(release->kernel_id);
> +
> +		vc_sm_release_resource(buffer);
> +	}
> +	break;
> +	default:
> +		pr_err("%s: Unknown vpu cmd %x\n", __func__, reply->trans_id);
> +		break;
> +	}
> +}
> +
> +/* Driver load/unload functions */
> +/* Videocore connected.  */
> +static void vc_sm_connected_init(void)
> +{
> +	int ret;
> +	struct vc_sm_version version;
> +	struct vc_sm_result_t version_result;
> +
> +	pr_info("[%s]: start\n", __func__);
> +
> +	/*
> +	 * Initialize and create a VCHI connection for the shared memory service
> +	 * running on videocore.
> +	 */
> +	ret = vchiq_initialise(&sm_state->vchiq_instance);
> +	if (ret) {
> +		pr_err("[%s]: failed to initialise VCHI instance (ret=%d)\n",
> +		       __func__, ret);
> +
> +		return;
> +	}
> +
> +	ret = vchiq_connect(sm_state->vchiq_instance);
> +	if (ret) {
> +		pr_err("[%s]: failed to connect VCHI instance (ret=%d)\n",
> +		       __func__, ret);
> +
> +		return;
> +	}
> +
> +	/* Initialize an instance of the shared memory service. */
> +	sm_state->sm_handle = vc_sm_cma_vchi_init(sm_state->vchiq_instance, 1,
> +						  vc_sm_vpu_event);
> +	if (!sm_state->sm_handle) {
> +		pr_err("[%s]: failed to initialize shared memory service\n",
> +		       __func__);
> +
> +		return;
> +	}
> +
> +	/* Create a debug fs directory entry (root). */
> +	sm_state->dir_root = debugfs_create_dir(VC_SM_DIR_ROOT_NAME, NULL);
> +
> +	sm_state->dir_state.show = &vc_sm_cma_global_state_show;
> +	sm_state->dir_state.dir_entry =
> +		debugfs_create_file(VC_SM_STATE, 0444, sm_state->dir_root,
> +				    &sm_state->dir_state,
> +				    &vc_sm_cma_debug_fs_fops);

Move this to a separate debugfs init function, grouped with the rest of
the debugfs code. debugfs is conditioned by CONFIG_DEBUG_FS, so you'll
need conditional compilation. You will also need to handle cleanup, you
never destroy the file and directory.

> +
> +	INIT_LIST_HEAD(&sm_state->buffer_list);
> +
> +	version.version = 2;
> +	ret = vc_sm_cma_vchi_client_version(sm_state->sm_handle, &version,
> +					    &version_result,
> +					    &sm_state->int_trans_id);
> +	if (ret) {
> +		pr_err("[%s]: Failed to send version request %d\n", __func__,
> +		       ret);
> +	}
> +
> +	/* Done! */
> +	sm_inited = 1;
> +	pr_info("[%s]: installed successfully\n", __func__);
> +}
> +
> +/* Driver loading. */
> +static int bcm2835_vc_sm_cma_probe(struct platform_device *pdev)
> +{
> +	pr_info("%s: Videocore shared memory driver\n", __func__);

Drop this, it only clutters the kernel log.

> +
> +	sm_state = devm_kzalloc(&pdev->dev, sizeof(*sm_state), GFP_KERNEL);
> +	if (!sm_state)
> +		return -ENOMEM;
> +	sm_state->pdev = pdev;
> +	mutex_init(&sm_state->map_lock);
> +
> +	spin_lock_init(&sm_state->kernelid_map_lock);
> +	idr_init_base(&sm_state->kernelid_map, 1);
> +
> +	pdev->dev.dma_parms = devm_kzalloc(&pdev->dev,
> +					   sizeof(*pdev->dev.dma_parms),
> +					   GFP_KERNEL);
> +	/* dma_set_max_seg_size checks if dma_parms is NULL. */
> +	dma_set_max_seg_size(&pdev->dev, 0x3FFFFFFF);
> +
> +	vchiq_add_connected_callback(vc_sm_connected_init);
> +	return 0;
> +}
> +
> +/* Driver unloading. */
> +static int bcm2835_vc_sm_cma_remove(struct platform_device *pdev)
> +{
> +	pr_debug("[%s]: start\n", __func__);

You can drop this message and the one at the end of the function.

> +	if (sm_inited) {
> +		misc_deregister(&sm_state->misc_dev);

There's no misc_register() call, something seems wrong. Probably
leftover code ?

> +
> +		/* Remove all proc entries. */
> +		debugfs_remove_recursive(sm_state->dir_root);
> +
> +		/* Stop the videocore shared memory service. */
> +		vc_sm_cma_vchi_stop(sm_state->vchiq_instance, &sm_state->sm_handle);
> +	}
> +
> +	if (sm_state) {

Drop the condition, sm_state can never be NULL if probe() succeeded, and
if it hasn't succeeded, then remove() will not be called.

> +		idr_destroy(&sm_state->kernelid_map);
> +
> +		/* Free the memory for the state structure. */
> +		mutex_destroy(&sm_state->map_lock);
> +	}
> +
> +	pr_debug("[%s]: end\n", __func__);
> +	return 0;
> +}
> +
> +/* Kernel API calls */
> +/* Get an internal resource handle mapped from the external one. */
> +int vc_sm_cma_int_handle(void *handle)
> +{
> +	struct dma_buf *dma_buf = (struct dma_buf *)handle;
> +	struct vc_sm_buffer *buf;
> +
> +	/* Validate we can work with this device. */
> +	if (!sm_state || !handle) {
> +		pr_err("[%s]: invalid input\n", __func__);
> +		return 0;
> +	}
> +
> +	buf = (struct vc_sm_buffer *)dma_buf->priv;
> +	return buf->vc_handle;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_int_handle);
> +
> +/* Free a previously allocated shared memory handle and block. */
> +int vc_sm_cma_free(void *handle)
> +{
> +	struct dma_buf *dma_buf = (struct dma_buf *)handle;
> +
> +	/* Validate we can work with this device. */
> +	if (!sm_state || !handle) {
> +		pr_err("[%s]: invalid input\n", __func__);
> +		return -EPERM;

That's a weird error code for an invalid input.

> +	}
> +
> +	pr_debug("%s: handle %p/dmabuf %p\n", __func__, handle, dma_buf);
> +
> +	dma_buf_put(dma_buf);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_free);
> +
> +/* Import a dmabuf to be shared with VC. */
> +int vc_sm_cma_import_dmabuf(struct dma_buf *src_dmabuf, void **handle)
> +{
> +	struct dma_buf *new_dma_buf;
> +	int ret;
> +
> +	/* Validate we can work with this device. */
> +	if (!sm_state || !src_dmabuf || !handle) {
> +		pr_err("[%s]: invalid input\n", __func__);
> +		return -EPERM;

Same here.

> +	}
> +
> +	ret = vc_sm_cma_import_dmabuf_internal(sm_state, src_dmabuf,
> +					       -1, &new_dma_buf);
> +
> +	if (!ret) {
> +		pr_debug("%s: imported to ptr %p\n", __func__, new_dma_buf);
> +
> +		/* Assign valid handle at this time.*/
> +		*handle = new_dma_buf;
> +	} else {
> +		/*
> +		 * succeeded in importing the dma_buf, but then
> +		 * failed to look it up again. How?
> +		 * Release the fd again.
> +		 */
> +		pr_err("%s: imported vc_sm_cma_get_buffer failed %d\n",
> +		       __func__, ret);
> +	}
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vc_sm_cma_import_dmabuf);

Those three functions are a bit ill-placed between the probe/remove
functions and the platform_driver. I'd move them just above
vc_sm_vpu_event(), and order them as mentioned below in the header file.

> +
> +static struct platform_driver bcm2835_vcsm_cma_driver = {
> +	.probe = bcm2835_vc_sm_cma_probe,
> +	.remove = bcm2835_vc_sm_cma_remove,
> +	.driver = {
> +		   .name = "vcsm-cma",
> +		   .owner = THIS_MODULE,
> +		  },

Wrong indentation, should be

	.driver = {
		.name = "vcsm-cma",
		.owner = THIS_MODULE,
	},

> +};
> +
> +module_platform_driver(bcm2835_vcsm_cma_driver);
> +
> +MODULE_AUTHOR("Dave Stevenson");
> +MODULE_DESCRIPTION("VideoCore CMA Shared Memory Driver");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS("platform:vcsm-cma");
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
> new file mode 100644
> index 000000000000..61e110ec414d
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory driver using CMA.
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + */
> +
> +#ifndef VC_SM_H
> +#define VC_SM_H
> +
> +#define VC_SM_MAX_NAME_LEN 32
> +
> +enum vc_sm_vpu_mapping_state {
> +	VPU_NOT_MAPPED,
> +	VPU_MAPPED,
> +	VPU_UNMAPPING
> +};
> +
> +struct vc_sm_buffer {
> +	struct list_head global_buffer_list;	/* Global list of buffers. */
> +
> +	/* Index in the kernel_id idr so that we can find the
> +	 * mmal_msg_context again when servicing the VCHI reply.
> +	 */
> +	int kernel_id;
> +
> +	size_t size;
> +
> +	/* Lock over all the following state for this buffer */
> +	struct mutex lock;
> +	struct list_head attachments;
> +
> +	char name[VC_SM_MAX_NAME_LEN];
> +
> +	int in_use:1;	/* Kernel is still using this resource */
> +
> +	enum vc_sm_vpu_mapping_state vpu_state;
> +	u32 vc_handle;	/* VideoCore handle for this buffer */
> +
> +	/* DMABUF related fields */
> +	struct dma_buf *dma_buf;
> +	dma_addr_t dma_addr;
> +	void *cookie;
> +
> +	struct vc_sm_privdata_t *private;
> +
> +	struct dma_buf *imported_dma_buf;
> +	struct dma_buf_attachment *attach;
> +	struct sg_table *sgt;
> +};
> +
> +#endif
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
> new file mode 100644
> index 000000000000..c77ef0998a31
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.c
> @@ -0,0 +1,507 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Copyright 2011-2012 Broadcom Corporation.  All rights reserved.
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation.
> + *
> + */
> +
> +/* ---- Include Files ----------------------------------------------------- */
> +#include <linux/kthread.h>
> +#include <linux/list.h>
> +#include <linux/semaphore.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +
> +#include "vc_sm_cma_vchi.h"
> +
> +#define VC_SM_VER  1
> +#define VC_SM_MIN_VER 0
> +
> +/* ---- Private Constants and Types -------------------------------------- */
> +
> +/* Command blocks come from a pool */
> +#define SM_MAX_NUM_CMD_RSP_BLKS 32
> +
> +/* The number of supported connections */
> +#define SM_MAX_NUM_CONNECTIONS 3
> +
> +struct sm_cmd_rsp_blk {
> +	struct list_head head;	/* To create lists */
> +	/* To be signaled when the response is there */
> +	struct completion cmplt;
> +
> +	u32 id;
> +	u16 length;
> +
> +	u8 msg[VC_SM_MAX_MSG_LEN];
> +
> +	uint32_t wait:1;
> +	uint32_t sent:1;
> +	uint32_t alloc:1;
> +
> +};
> +
> +struct sm_instance {
> +	u32 num_connections;
> +	unsigned int service_handle[SM_MAX_NUM_CONNECTIONS];
> +	struct task_struct *io_thread;
> +	struct completion io_cmplt;
> +
> +	vpu_event_cb vpu_event;
> +
> +	/* Mutex over the following lists */
> +	struct mutex lock;
> +	u32 trans_id;
> +	struct list_head cmd_list;
> +	struct list_head rsp_list;
> +	struct list_head dead_list;
> +
> +	struct sm_cmd_rsp_blk free_blk[SM_MAX_NUM_CMD_RSP_BLKS];
> +
> +	/* Mutex over the free_list */
> +	struct mutex free_lock;
> +	struct list_head free_list;
> +
> +	struct semaphore free_sema;
> +	struct vchiq_instance *vchiq_instance;
> +};
> +
> +/* ---- Private Variables ------------------------------------------------ */
> +
> +/* ---- Private Function Prototypes -------------------------------------- */
> +
> +/* ---- Private Functions ------------------------------------------------ */
> +static int
> +bcm2835_vchi_msg_queue(struct vchiq_instance *vchiq_instance, unsigned int handle,
> +		       void *data,
> +		       unsigned int size)
> +{
> +	return vchiq_queue_kernel_message(vchiq_instance, handle, data, size);
> +}
> +
> +static struct
> +sm_cmd_rsp_blk *vc_vchi_cmd_create(struct sm_instance *instance,
> +				   enum vc_sm_msg_type id, void *msg,
> +				   u32 size, int wait)
> +{
> +	struct sm_cmd_rsp_blk *blk;
> +	struct vc_sm_msg_hdr_t *hdr;
> +
> +	if (down_interruptible(&instance->free_sema)) {
> +		blk = kmalloc(sizeof(*blk), GFP_KERNEL);
> +		if (!blk)
> +			return NULL;
> +
> +		blk->alloc = 1;
> +		init_completion(&blk->cmplt);
> +	} else {
> +		mutex_lock(&instance->free_lock);
> +		blk =
> +		    list_first_entry(&instance->free_list,
> +				     struct sm_cmd_rsp_blk, head);
> +		list_del(&blk->head);
> +		mutex_unlock(&instance->free_lock);
> +	}
> +
> +	blk->sent = 0;
> +	blk->wait = wait;
> +	blk->length = sizeof(*hdr) + size;
> +
> +	hdr = (struct vc_sm_msg_hdr_t *)blk->msg;
> +	hdr->type = id;
> +	mutex_lock(&instance->lock);
> +	instance->trans_id++;
> +	/*
> +	 * Retain the top bit for identifying asynchronous events, or VPU cmds.
> +	 */
> +	instance->trans_id &= ~0x80000000;
> +	hdr->trans_id = instance->trans_id;
> +	blk->id = instance->trans_id;
> +	mutex_unlock(&instance->lock);
> +
> +	if (size)
> +		memcpy(hdr->body, msg, size);
> +
> +	return blk;
> +}
> +
> +static void
> +vc_vchi_cmd_delete(struct sm_instance *instance, struct sm_cmd_rsp_blk *blk)
> +{
> +	if (blk->alloc) {
> +		kfree(blk);
> +		return;
> +	}
> +
> +	mutex_lock(&instance->free_lock);
> +	list_add(&blk->head, &instance->free_list);
> +	mutex_unlock(&instance->free_lock);
> +	up(&instance->free_sema);
> +}
> +
> +static void vc_sm_cma_vchi_rx_ack(struct sm_instance *instance,
> +				  struct sm_cmd_rsp_blk *cmd,
> +				  struct vc_sm_result_t *reply,
> +				  u32 reply_len)
> +{
> +	mutex_lock(&instance->lock);
> +	list_for_each_entry(cmd,
> +			    &instance->rsp_list,
> +			    head) {
> +		if (cmd->id == reply->trans_id)
> +			break;
> +	}
> +	mutex_unlock(&instance->lock);
> +
> +	if (&cmd->head == &instance->rsp_list) {
> +		//pr_debug("%s: received response %u, throw away...",
> +		pr_err("%s: received response %u, throw away...",
> +		       __func__,
> +		       reply->trans_id);
> +	} else if (reply_len > sizeof(cmd->msg)) {
> +		pr_err("%s: reply too big (%u) %u, throw away...",
> +		       __func__, reply_len,
> +		     reply->trans_id);
> +	} else {
> +		memcpy(cmd->msg, reply,
> +		       reply_len);
> +		complete(&cmd->cmplt);
> +	}
> +}
> +
> +static int vc_sm_cma_vchi_videocore_io(void *arg)
> +{
> +	struct sm_instance *instance = arg;
> +	struct sm_cmd_rsp_blk *cmd = NULL, *cmd_tmp;
> +	struct vc_sm_result_t *reply;
> +	struct vchiq_header *header;
> +	s32 status;
> +	int svc_use = 1;
> +
> +	while (1) {
> +		if (svc_use)
> +			vchiq_release_service(instance->vchiq_instance,
> +					      instance->service_handle[0]);
> +		svc_use = 0;
> +
> +		if (wait_for_completion_interruptible(&instance->io_cmplt))
> +			continue;

Isn't it a bit overkill to use a thread, wouldn't a workqueue do ?

> +		vchiq_use_service(instance->vchiq_instance, instance->service_handle[0]);
> +		svc_use = 1;
> +
> +		do {
> +			/*
> +			 * Get new command and move it to response list
> +			 */
> +			mutex_lock(&instance->lock);
> +			if (list_empty(&instance->cmd_list)) {
> +				/* no more commands to process */
> +				mutex_unlock(&instance->lock);
> +				break;
> +			}
> +			cmd = list_first_entry(&instance->cmd_list,
> +					       struct sm_cmd_rsp_blk, head);
> +			list_move(&cmd->head, &instance->rsp_list);
> +			cmd->sent = 1;
> +			mutex_unlock(&instance->lock);
> +			/* Send the command */
> +			status =
> +				bcm2835_vchi_msg_queue(instance->vchiq_instance,
> +						       instance->service_handle[0],
> +						       cmd->msg, cmd->length);
> +			if (status) {
> +				pr_err("%s: failed to queue message (%d)",
> +				       __func__, status);
> +			}
> +
> +			/* If no reply is needed then we're done */
> +			if (!cmd->wait) {
> +				mutex_lock(&instance->lock);
> +				list_del(&cmd->head);
> +				mutex_unlock(&instance->lock);
> +				vc_vchi_cmd_delete(instance, cmd);
> +				continue;
> +			}
> +
> +			if (status) {
> +				complete(&cmd->cmplt);
> +				continue;
> +			}
> +
> +		} while (1);
> +
> +		while ((header = vchiq_msg_hold(instance->vchiq_instance,
> +						instance->service_handle[0]))) {
> +			reply = (struct vc_sm_result_t *)header->data;
> +			if (reply->trans_id & 0x80000000) {
> +				/* Async event or cmd from the VPU */
> +				if (instance->vpu_event)
> +					instance->vpu_event(instance, reply,
> +							    header->size);
> +			} else {
> +				vc_sm_cma_vchi_rx_ack(instance, cmd, reply,
> +						      header->size);
> +			}
> +
> +			vchiq_release_message(instance->vchiq_instance,
> +					      instance->service_handle[0],
> +					      header);
> +		}
> +
> +		/* Go through the dead list and free them */
> +		mutex_lock(&instance->lock);
> +		list_for_each_entry_safe(cmd, cmd_tmp, &instance->dead_list,
> +					 head) {
> +			list_del(&cmd->head);
> +			vc_vchi_cmd_delete(instance, cmd);
> +		}
> +		mutex_unlock(&instance->lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static enum vchiq_status vc_sm_cma_vchi_callback(struct vchiq_instance *vchiq_instance,
> +						 enum vchiq_reason reason,
> +						 struct vchiq_header *header,
> +						 unsigned int handle, void *userdata)
> +{
> +	struct sm_instance *instance = vchiq_get_service_userdata(vchiq_instance, handle);
> +
> +	switch (reason) {
> +	case VCHIQ_MESSAGE_AVAILABLE:
> +		vchiq_msg_queue_push(vchiq_instance, handle, header);
> +		complete(&instance->io_cmplt);
> +		break;
> +
> +	case VCHIQ_SERVICE_CLOSED:
> +		pr_info("%s: service CLOSED!!", __func__);
> +	default:
> +		break;
> +	}
> +
> +	return VCHIQ_SUCCESS;
> +}
> +
> +struct sm_instance *vc_sm_cma_vchi_init(struct vchiq_instance *vchiq_instance,
> +					unsigned int num_connections,
> +					vpu_event_cb vpu_event)
> +{
> +	u32 i;
> +	struct sm_instance *instance;
> +	int status;
> +
> +	pr_debug("%s: start", __func__);
> +
> +	if (num_connections > SM_MAX_NUM_CONNECTIONS) {
> +		pr_err("%s: unsupported number of connections %u (max=%u)",
> +		       __func__, num_connections, SM_MAX_NUM_CONNECTIONS);
> +
> +		goto err_null;
> +	}
> +	/* Allocate memory for this instance */
> +	instance = kzalloc(sizeof(*instance), GFP_KERNEL);
> +
> +	/* Misc initialisations */
> +	mutex_init(&instance->lock);
> +	init_completion(&instance->io_cmplt);
> +	INIT_LIST_HEAD(&instance->cmd_list);
> +	INIT_LIST_HEAD(&instance->rsp_list);
> +	INIT_LIST_HEAD(&instance->dead_list);
> +	INIT_LIST_HEAD(&instance->free_list);
> +	sema_init(&instance->free_sema, SM_MAX_NUM_CMD_RSP_BLKS);
> +	mutex_init(&instance->free_lock);
> +	for (i = 0; i < SM_MAX_NUM_CMD_RSP_BLKS; i++) {
> +		init_completion(&instance->free_blk[i].cmplt);
> +		list_add(&instance->free_blk[i].head, &instance->free_list);
> +	}
> +
> +	instance->vchiq_instance = vchiq_instance;
> +
> +	/* Open the VCHI service connections */
> +	instance->num_connections = num_connections;
> +	for (i = 0; i < num_connections; i++) {
> +		struct vchiq_service_params_kernel params = {
> +			.version = VC_SM_VER,
> +			.version_min = VC_SM_MIN_VER,
> +			.fourcc = VCHIQ_MAKE_FOURCC('S', 'M', 'E', 'M'),
> +			.callback = vc_sm_cma_vchi_callback,
> +			.userdata = instance,
> +		};
> +
> +		status = vchiq_open_service(vchiq_instance, &params,
> +					    &instance->service_handle[i]);
> +		if (status) {
> +			pr_err("%s: failed to open VCHI service (%d)",
> +			       __func__, status);
> +
> +			goto err_close_services;
> +		}
> +	}
> +	/* Create the thread which takes care of all io to/from videoocore. */
> +	instance->io_thread = kthread_create(&vc_sm_cma_vchi_videocore_io,
> +					     (void *)instance, "SMIO");
> +	if (!instance->io_thread) {
> +		pr_err("%s: failed to create SMIO thread", __func__);
> +
> +		goto err_close_services;
> +	}
> +	instance->vpu_event = vpu_event;
> +	set_user_nice(instance->io_thread, -10);
> +	wake_up_process(instance->io_thread);
> +
> +	pr_debug("%s: success - instance %p", __func__, instance);
> +	return instance;
> +
> +err_close_services:
> +	for (i = 0; i < instance->num_connections; i++) {
> +		if (instance->service_handle[i])
> +			vchiq_close_service(vchiq_instance, instance->service_handle[i]);
> +	}
> +	kfree(instance);
> +err_null:
> +	pr_debug("%s: FAILED", __func__);
> +	return NULL;
> +}
> +
> +int vc_sm_cma_vchi_stop(struct vchiq_instance *vchiq_instance, struct sm_instance **handle)
> +{
> +	struct sm_instance *instance;
> +	u32 i;
> +
> +	if (!handle) {
> +		pr_err("%s: invalid pointer to handle %p", __func__, handle);
> +		goto lock;
> +	}
> +
> +	if (!*handle) {
> +		pr_err("%s: invalid handle %p", __func__, *handle);
> +		goto lock;
> +	}
> +
> +	instance = *handle;
> +
> +	/* Close all VCHI service connections */
> +	for (i = 0; i < instance->num_connections; i++) {
> +		vchiq_use_service(vchiq_instance, instance->service_handle[i]);
> +		vchiq_close_service(vchiq_instance, instance->service_handle[i]);
> +	}
> +
> +	kfree(instance);
> +
> +	*handle = NULL;
> +	return 0;
> +
> +lock:
> +	return -EINVAL;
> +}
> +
> +static int vc_sm_cma_vchi_send_msg(struct sm_instance *handle,
> +				   enum vc_sm_msg_type msg_id, void *msg,
> +				   u32 msg_size, void *result, u32 result_size,
> +				   u32 *cur_trans_id, u8 wait_reply)
> +{
> +	int status = 0;
> +	struct sm_instance *instance = handle;
> +	struct sm_cmd_rsp_blk *cmd_blk;
> +
> +	if (!handle) {
> +		pr_err("%s: invalid handle", __func__);
> +		return -EINVAL;
> +	}
> +	if (!msg) {
> +		pr_err("%s: invalid msg pointer", __func__);
> +		return -EINVAL;
> +	}
> +
> +	cmd_blk =
> +	    vc_vchi_cmd_create(instance, msg_id, msg, msg_size, wait_reply);
> +	if (!cmd_blk) {
> +		pr_err("[%s]: failed to allocate global tracking resource",
> +		       __func__);
> +		return -ENOMEM;
> +	}
> +
> +	if (cur_trans_id)
> +		*cur_trans_id = cmd_blk->id;
> +
> +	mutex_lock(&instance->lock);
> +	list_add_tail(&cmd_blk->head, &instance->cmd_list);
> +	mutex_unlock(&instance->lock);
> +	complete(&instance->io_cmplt);
> +
> +	if (!wait_reply)
> +		/* We're done */
> +		return 0;
> +
> +	/* Wait for the response */
> +	if (wait_for_completion_interruptible(&cmd_blk->cmplt)) {
> +		mutex_lock(&instance->lock);
> +		if (!cmd_blk->sent) {
> +			list_del(&cmd_blk->head);
> +			mutex_unlock(&instance->lock);
> +			vc_vchi_cmd_delete(instance, cmd_blk);
> +			return -ENXIO;
> +		}
> +
> +		list_move(&cmd_blk->head, &instance->dead_list);
> +		mutex_unlock(&instance->lock);
> +		complete(&instance->io_cmplt);
> +		return -EINTR;	/* We're done */
> +	}
> +
> +	if (result && result_size) {
> +		memcpy(result, cmd_blk->msg, result_size);
> +	} else {
> +		struct vc_sm_result_t *res =
> +			(struct vc_sm_result_t *)cmd_blk->msg;
> +		status = (res->success == 0) ? 0 : -ENXIO;
> +	}
> +
> +	mutex_lock(&instance->lock);
> +	list_del(&cmd_blk->head);
> +	mutex_unlock(&instance->lock);
> +	vc_vchi_cmd_delete(instance, cmd_blk);
> +	return status;
> +}
> +
> +int vc_sm_cma_vchi_free(struct sm_instance *handle, struct vc_sm_free_t *msg,
> +			u32 *cur_trans_id)
> +{
> +	return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_FREE,
> +				   msg, sizeof(*msg), 0, 0, cur_trans_id, 0);
> +}
> +
> +int vc_sm_cma_vchi_import(struct sm_instance *handle, struct vc_sm_import *msg,
> +			  struct vc_sm_import_result *result, u32 *cur_trans_id)
> +{
> +	return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_IMPORT,
> +				   msg, sizeof(*msg), result, sizeof(*result),
> +				   cur_trans_id, 1);
> +}
> +
> +int vc_sm_cma_vchi_client_version(struct sm_instance *handle,
> +				  struct vc_sm_version *msg,
> +				  struct vc_sm_result_t *result,
> +				  u32 *cur_trans_id)
> +{
> +	return vc_sm_cma_vchi_send_msg(handle, VC_SM_MSG_TYPE_CLIENT_VERSION,
> +				   //msg, sizeof(*msg), result, sizeof(*result),
> +				   //cur_trans_id, 1);
> +				   msg, sizeof(*msg), NULL, 0,
> +				   cur_trans_id, 0);
> +}
> +
> +int vc_sm_vchi_client_vc_mem_req_reply(struct sm_instance *handle,
> +				       struct vc_sm_vc_mem_request_result *msg,
> +				       uint32_t *cur_trans_id)
> +{
> +	return vc_sm_cma_vchi_send_msg(handle,
> +				       VC_SM_MSG_TYPE_VC_MEM_REQUEST_REPLY,
> +				       msg, sizeof(*msg), 0, 0, cur_trans_id,
> +				       0);
> +}
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
> new file mode 100644
> index 000000000000..a4f40d4cef05
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_cma_vchi.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + * Copyright 2011-2012 Broadcom Corporation.  All rights reserved.
> + *
> + * Based on vmcs_sm driver from Broadcom Corporation.
> + *
> + */
> +
> +#ifndef __VC_SM_CMA_VCHI_H__INCLUDED__

This is quite inconsistent with the existing coding style of
vc04_services and of the kernel as a whole. I'd go for VC_SM_CMA_VCHI_H,
or if preferred, __VC_SM_CMA_VCHI_H or __VC_SM_CMA_VCHI_H__.

> +#define __VC_SM_CMA_VCHI_H__INCLUDED__
> +
> +#include <linux/raspberrypi/vchiq.h>
> +
> +#include "vc_sm_defs.h"
> +
> +/*
> + * Forward declare.
> + */

Drop this.

> +struct sm_instance;
> +
> +typedef void (*vpu_event_cb)(struct sm_instance *instance,
> +			     struct vc_sm_result_t *reply, int reply_len);
> +
> +/*
> + * Initialize the shared memory service, opens up vchi connection to talk to it.
> + */
> +struct sm_instance *vc_sm_cma_vchi_init(struct vchiq_instance *vchi_instance,
> +					unsigned int num_connections,
> +					vpu_event_cb vpu_event);
> +
> +/*
> + * Terminates the shared memory service.
> + */
> +int vc_sm_cma_vchi_stop(struct vchiq_instance *vchi_instance, struct sm_instance **handle);
> +
> +/*
> + * Ask the shared memory service to free up some memory that was previously
> + * allocated by the vc_sm_cma_vchi_alloc function call.
> + */
> +int vc_sm_cma_vchi_free(struct sm_instance *handle, struct vc_sm_free_t *msg,
> +			u32 *cur_trans_id);
> +
> +/*
> + * Import a contiguous block of memory and wrap it in a GPU MEM_HANDLE_T.
> + */
> +int vc_sm_cma_vchi_import(struct sm_instance *handle, struct vc_sm_import *msg,
> +			  struct vc_sm_import_result *result,
> +			  u32 *cur_trans_id);
> +
> +int vc_sm_cma_vchi_client_version(struct sm_instance *handle,
> +				  struct vc_sm_version *msg,
> +				  struct vc_sm_result_t *result,
> +				  u32 *cur_trans_id);
> +
> +int vc_sm_vchi_client_vc_mem_req_reply(struct sm_instance *handle,
> +				       struct vc_sm_vc_mem_request_result *msg,
> +				       uint32_t *cur_trans_id);
> +
> +#endif /* __VC_SM_CMA_VCHI_H__INCLUDED__ */
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
> new file mode 100644
> index 000000000000..ad4a3ec194d3
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_defs.h
> @@ -0,0 +1,187 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + * Based on vc_sm_defs.h from the vmcs_sm driver Copyright Broadcom Corporation.
> + * All IPC messages are copied across to this file, even if the vc-sm-cma
> + * driver is not currently using them.
> + *
> + ****************************************************************************
> + */
> +
> +#ifndef __VC_SM_DEFS_H__INCLUDED__
> +#define __VC_SM_DEFS_H__INCLUDED__
> +
> +/* Maximum message length */
> +#define VC_SM_MAX_MSG_LEN (sizeof(union vc_sm_msg_union_t) + \
> +	sizeof(struct vc_sm_msg_hdr_t))
> +#define VC_SM_MAX_RSP_LEN (sizeof(union vc_sm_msg_union_t))
> +
> +/* Resource name maximum size */
> +#define VC_SM_RESOURCE_NAME 32
> +
> +/*
> + * Version to be reported to the VPU
> + * VPU assumes 0 (aka 1) which does not require the released callback, nor
> + * expect the client to handle VC_MEM_REQUESTS.
> + * Version 2 requires the released callback, and must support VC_MEM_REQUESTS.
> + */
> +#define VC_SM_PROTOCOL_VERSION	2
> +
> +enum vc_sm_msg_type {
> +	/* Message types supported for HOST->VC direction */
> +
> +	/* Allocate shared memory block */
> +	VC_SM_MSG_TYPE_ALLOC,
> +	/* Lock allocated shared memory block */
> +	VC_SM_MSG_TYPE_LOCK,
> +	/* Unlock allocated shared memory block */
> +	VC_SM_MSG_TYPE_UNLOCK,
> +	/* Unlock allocated shared memory block, do not answer command */
> +	VC_SM_MSG_TYPE_UNLOCK_NOANS,
> +	/* Free shared memory block */
> +	VC_SM_MSG_TYPE_FREE,
> +	/* Resize a shared memory block */
> +	VC_SM_MSG_TYPE_RESIZE,
> +	/* Walk the allocated shared memory block(s) */
> +	VC_SM_MSG_TYPE_WALK_ALLOC,
> +
> +	/* A previously applied action will need to be reverted */
> +	VC_SM_MSG_TYPE_ACTION_CLEAN,
> +
> +	/*
> +	 * Import a physical address and wrap into a MEM_HANDLE_T.
> +	 * Release with VC_SM_MSG_TYPE_FREE.
> +	 */
> +	VC_SM_MSG_TYPE_IMPORT,
> +	/*
> +	 *Tells VC the protocol version supported by this client.
> +	 * 2 supports the async/cmd messages from the VPU for final release
> +	 * of memory, and for VC allocations.
> +	 */
> +	VC_SM_MSG_TYPE_CLIENT_VERSION,
> +	/* Response to VC request for memory */
> +	VC_SM_MSG_TYPE_VC_MEM_REQUEST_REPLY,
> +
> +	/*
> +	 * Asynchronous/cmd messages supported for VC->HOST direction.
> +	 * Signalled by setting the top bit in vc_sm_result_t trans_id.
> +	 */
> +
> +	/*
> +	 * VC has finished with an imported memory allocation.
> +	 * Release any Linux reference counts on the underlying block.
> +	 */
> +	VC_SM_MSG_TYPE_RELEASED,
> +	/* VC request for memory */
> +	VC_SM_MSG_TYPE_VC_MEM_REQUEST,
> +
> +	VC_SM_MSG_TYPE_MAX
> +};
> +
> +/* Type of memory to be allocated */
> +enum vc_sm_alloc_type_t {
> +	VC_SM_ALLOC_CACHED,
> +	VC_SM_ALLOC_NON_CACHED,
> +};
> +
> +/* Message header for all messages in HOST->VC direction */
> +struct vc_sm_msg_hdr_t {
> +	u32 type;
> +	u32 trans_id;
> +	u8 body[0];
> +
> +};
> +
> +/* Request to free a previously allocated memory (HOST->VC) */
> +struct vc_sm_free_t {
> +	/* Resource handle (returned from alloc) */
> +	u32 res_handle;
> +	/* Resource buffer (returned from alloc) */
> +	u32 res_mem;
> +
> +};
> +
> +/* Generic result for a request (VC->HOST) */
> +struct vc_sm_result_t {
> +	/* Transaction identifier */
> +	u32 trans_id;
> +
> +	s32 success;
> +
> +};
> +
> +/* Request to import memory (HOST->VC) */
> +struct vc_sm_import {
> +	/* type of memory to allocate */
> +	enum vc_sm_alloc_type_t type;
> +	/* pointer to the VC (ie physical) address of the allocated memory */
> +	u32 addr;
> +	/* size of buffer */
> +	u32 size;
> +	/* opaque handle returned in RELEASED messages */
> +	u32 kernel_id;
> +	/* Allocator identifier */
> +	u32 allocator;
> +	/* resource name (for easier tracking on vc side) */
> +	char     name[VC_SM_RESOURCE_NAME];
> +};
> +
> +/* Result of a requested memory import (VC->HOST) */
> +struct vc_sm_import_result {
> +	/* Transaction identifier */
> +	u32 trans_id;
> +
> +	/* Resource handle */
> +	u32 res_handle;
> +};
> +
> +/* Notification that VC has finished with an allocation (VC->HOST) */
> +struct vc_sm_released {
> +	/* cmd type / trans_id */
> +	u32 cmd;
> +
> +	/* pointer to the VC (ie physical) address of the allocated memory */
> +	u32 addr;
> +	/* size of buffer */
> +	u32 size;
> +	/* opaque handle returned in RELEASED messages */
> +	u32 kernel_id;
> +	u32 vc_handle;
> +};
> +
> +/*
> + * Client informing VC as to the protocol version it supports.
> + * >=2 requires the released callback, and supports VC asking for memory.
> + * Failure means that the firmware doesn't support this call, and therefore the
> + * client should either fail, or NOT rely on getting the released callback.
> + */
> +struct vc_sm_version {
> +	u32 version;
> +};
> +
> +/* Response from the kernel to provide the VPU with some memory */
> +struct vc_sm_vc_mem_request_result {
> +	/* Transaction identifier for the VPU */
> +	u32 trans_id;
> +	/* pointer to the physical address of the allocated memory */
> +	u32 addr;
> +	/* opaque handle returned in RELEASED messages */
> +	u32 kernel_id;
> +};
> +
> +/* Union of ALL messages */
> +union vc_sm_msg_union_t {
> +	struct vc_sm_free_t free;
> +	struct vc_sm_result_t result;
> +	struct vc_sm_import import;
> +	struct vc_sm_import_result import_result;
> +	struct vc_sm_version version;
> +	struct vc_sm_released released;
> +	struct vc_sm_vc_mem_request_result vc_request_result;
> +};
> +
> +#endif /* __VC_SM_DEFS_H__INCLUDED__ */
> diff --git a/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
> new file mode 100644
> index 000000000000..988fdd967922
> --- /dev/null
> +++ b/drivers/staging/vc04_services/vc-sm-cma/vc_sm_knl.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * VideoCore Shared Memory CMA allocator
> + *
> + * Copyright: 2018, Raspberry Pi (Trading) Ltd
> + *
> + * Based on vc_sm_defs.h from the vmcs_sm driver Copyright Broadcom Corporation.
> + *
> + */
> +
> +#ifndef __VC_SM_KNL_H__INCLUDED__
> +#define __VC_SM_KNL_H__INCLUDED__
> +
> +#if !defined(__KERNEL__)
> +#error "This interface is for kernel use only..."
> +#endif

As this header isn't available to userspace, I think you can drop this.

> +
> +/* Free a previously allocated or imported shared memory handle and block. */

kerneldoc would be nice for such an inter-driver API.

> +int vc_sm_cma_free(void *handle);
> +
> +/* Get an internal resource handle mapped from the external one. */
> +int vc_sm_cma_int_handle(void *handle);
> +
> +/* Import a block of memory into the GPU space. */
> +int vc_sm_cma_import_dmabuf(struct dma_buf *dmabuf, void **handle);

I would order these function in the expected call order: import,
int_handle and free.

> +
> +#endif /* __VC_SM_KNL_H__INCLUDED__ */

-- 
Regards,

Laurent Pinchart