Re: [PATCH 2/2] drm/nvdla: Add driver support for NVDLA

Cai Huoqing <cai.huoqing@xxxxxxxxx> · Mon, 25 Apr 2022 22:28:57 +0800



On 22 4月 22 01:01:14, Kari Argillander wrote:
> This is just a quick look. I basically checked some style issues and did
> some basic static analysis.
> 
> I have run
>  - cppcheck (which found a couple of mistakes)
>  - flawfinder (did not find anything, as far as I can see)
>  - codespell (did find a couple of typos)
> 
> You can run these yourself also or check below.
> 
> A couple of common things, which you can ignore or not:
> - Usually in this code a "goto exit" leads to nothing but a return. Maybe
>   just return straight away. No need to jump.
> - Some comments start with a capital letter, others do not. Maybe all should
>   start with a capital. Very small nit, but it makes a nice touch to the code.
> - A lot of one-line comments are unnecessarily written as three-line comments.
> 
> On 19.4.2022 16.59, Cai Huoqing wrote:
> > The NVIDIA Deep Learning Accelerator (NVDLA) is an open source IP
> > which is integrated into NVIDIA Jetson AGX Xavier,
> > so add driver support for this accelerator.
> > 
> > Signed-off-by: Cai Huoqing <cai.huoqing@xxxxxxxxx>
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_bdma.c b/drivers/gpu/drm/nvdla/nvdla_bdma.c
> > new file mode 100644
> > index 000000000000..225613f27acf
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_bdma.c
> 
> ... snip
> 
> > +static int32_t
> > +processor_bdma_program_slot(struct dla_engine *engine,
> > +							struct dla_bdma_surface_desc *bdma_surface,
> > +							struct dla_bdma_transfer_desc *transfer)
> > +{
> > +	int32_t ret = 0;
> > +	uint64_t source_addr = 0;
> > +	uint64_t destination_addr = 0;
> > +	uint32_t high, low, reg;
> > +	uint8_t  bdma_free_slots = 0;
> > +
> > +	/* make sure there're enough free slots */
> > +	if (bdma_free_slots <= 0) {
> 
> This is always true right now, because bdma_free_slots is initialized to 0
> just above and never changed before this check.
> 
> > +		do {
> > +			reg = bdma_reg_read(engine, STATUS);
> > +			reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >>
> > +					SHIFT(BDMA_STATUS_0, FREE_SLOT);
> > +		} while (reg == 0);
> > +		bdma_free_slots = (uint8_t)reg;
> > +	}
> > +
> > +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> > +						transfer->source_address,
> > +						(void *)&source_addr,
> > +						DESTINATION_DMA);
> > +	dla_get_dma_address(engine->driver_context, engine->task->task_data,
> > +						transfer->destination_address,
> > +						(void *)&destination_addr,
> > +						DESTINATION_DMA);
> > +
> > +	ASSERT_GOTO((transfer->line_repeat <= 8192),
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO((transfer->surface_repeat <= 8192),
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO((transfer->line_size % 32) == 0,
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO(transfer->source_line >= transfer->line_size,
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO(transfer->destination_line >= transfer->line_size,
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO(transfer->source_surface >=
> > +			(transfer->source_line * transfer->line_repeat),
> > +				ret, -EINVAL, exit);
> > +	ASSERT_GOTO(transfer->destination_surface >=
> > +			(transfer->destination_line * transfer->line_repeat),
> > +				ret, -EINVAL, exit);
> > +
> > +	/* config registers */
> > +	high = upper_32_bits(source_addr);
> > +	low = lower_32_bits(source_addr);
> > +	bdma_reg_write(engine, CFG_SRC_ADDR_LOW, low);
> > +	bdma_reg_write(engine, CFG_SRC_ADDR_HIGH, high);
> > +	high = upper_32_bits(destination_addr);
> > +	low = lower_32_bits(destination_addr);
> > +	bdma_reg_write(engine, CFG_DST_ADDR_LOW, low);
> > +	bdma_reg_write(engine, CFG_DST_ADDR_HIGH, high);
> > +	bdma_reg_write(engine, CFG_LINE, (transfer->line_size >> 5) - 1);
> > +	reg = (map_mem[bdma_surface->source_type] <<
> > +				SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) |
> > +		(map_mem[bdma_surface->destination_type] <<
> > +				SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE));
> > +	bdma_reg_write(engine, CFG_CMD, reg);
> > +	bdma_reg_write(engine, CFG_LINE_REPEAT, transfer->line_repeat - 1);
> > +	bdma_reg_write(engine, CFG_SRC_LINE, transfer->source_line);
> > +	bdma_reg_write(engine, CFG_DST_LINE, transfer->destination_line);
> > +	bdma_reg_write(engine, CFG_SURF_REPEAT, transfer->surface_repeat - 1);
> > +	bdma_reg_write(engine, CFG_SRC_SURF, transfer->source_surface);
> > +	bdma_reg_write(engine, CFG_DST_SURF, transfer->destination_surface);
> > +	bdma_reg_write(engine, CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE));
> > +
> > +exit:
> > +	return ret;
> > +}
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_cache.c b/drivers/gpu/drm/nvdla/nvdla_cache.c
> > new file mode 100644
> > index 000000000000..f8bd7b514aab
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_cache.c
> > @@ -0,0 +1,215 @@
> > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> > +/*
> > + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> > + * Copyright (C) 2022 Cai Huoqing
> > + */
> > +
> > +#include "nvdla_common.h"
> > +#include "nvdla_drm.h"
> > +#include "nvdla_reg.h"
> > +#include "nvdla_engine.h"
> > +
> > +#define DLA_OP_CACHE_SIZE (DLA_NUM_GROUPS * ((DLA_OP_NUM + 2) * 2))
> > +
> > +static struct dla_common_op_desc desc_cache[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> > +static int32_t desc_refcount[DLA_OP_NUM][DLA_OP_CACHE_SIZE];
> > +
> > +void
> > +dla_get_refcount(struct dla_common_op_desc *op_desc)
> > +{
> > +	int32_t i;
> > +	struct dla_common_op_desc *desc = NULL;
> > +
> > +	if (op_desc == NULL)
> > +		return;
> > +
> > +	if (op_desc->index == -1)
> > +		return;
> > +
> > +	desc = &desc_cache[op_desc->op_type][0];
> > +
> > +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> > +		if (desc->index == op_desc->index &&
> > +				desc->roi_index == op_desc->roi_index) {
> 
> reverse if
> 
> 		if (desc->index != op_desc->index)
> 			continue;
> 		if (desc->roi_index != op_desc->roi_index)
> 			continue;
> 
> > +			desc_refcount[op_desc->op_type][i]++;
> > +			return;
> > +		}
> > +	}
> > +}
> > +
> > +struct dla_common_op_desc *
> > +dla_get_op_desc(struct dla_engine *engine,
> > +				struct dla_task *task, int16_t index,
> > +				uint8_t op_type, uint8_t roi_index)
> > +{
> > +	int32_t i;
> > +	int32_t ret;
> > +	uint64_t op_base;
> > +	uint64_t dep_graph_addr;
> > +	struct dla_common_op_desc *desc = NULL;
> > +
> > +	if (index == -1) {
> > +		pr_debug("no desc get due to index==-1\n");
> > +		goto exit;
> > +	}
> > +
> > +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> > +				engine->network->num_operations * roi_index);
> > +
> > +	desc = &desc_cache[op_type][0];
> > +
> > +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> > +		if (desc->index == index && desc->roi_index == roi_index) {
> > +			if (desc->op_type != op_type) {
> > +				pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
> > +					   op_type, desc->op_type);
> > +				continue;
> > +			}
> 
> reverse if so this will be pretty clean
> 
> 		if (desc->index != index)
> 			continue;
> 		if (desc->roi_index != roi_index)
> 			continue;
> 		if (desc->op_type != op_type) {
> 			pr_err("op_cache[op=%u] contains incorrect entry of op[%u]\n",
> 					op_type, desc->op_type);
> 			continue;
> 		}
> 
> 
> > +			desc_refcount[op_type][i]++;
> > +			goto exit;
> > +		}
> > +	}
> > +
> > +	desc = &desc_cache[op_type][0];
> > +
> > +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> > +		if (desc->index == -1) {
> 
> reverse if
> 		if (desc->index != -1)
> 			continue;
> 
> > +			op_base = dep_graph_addr +
> > +					(sizeof(struct dla_common_op_desc) *
> > +					(uint64_t)index);
> > +			ret = dla_data_read(engine->driver_context,
> > +					task->task_data,
> > +					task->dependency_graph_addr,
> > +					(void *)(desc),
> > +					sizeof(struct dla_common_op_desc),
> > +					op_base);
> > +			if (ret) {
> > +				desc = NULL;
> > +				goto exit;
> > +			}
> > +
> > +			if (op_type != desc->op_type) {
> > +				/*
> > +				 * op_type of entry read from DRAM should not
> > +				 * mismatch with given op_type. If they
> > +				 * mismatches, then wrong entry is fetched, so
> > +				 * report this issue by throwing error.
> > +				 */
> > +				pr_err("Fetched [op_type=%u] from DRAM doesn't match with op_type[%u]\n",
> > +					   desc->op_type, op_type);
> > +				desc->op_type = op_type;
> > +				desc->index = -1;
> > +				desc->roi_index = -1;
> > +				desc = NULL;
> > +				goto exit;
> > +			}
> > +
> > +			desc->index = index;
> > +			desc->roi_index = roi_index;
> > +
> > +			desc_refcount[op_type][i]++;
> > +			goto exit;
> > +		}
> > +	}
> > +
> > +exit:
> > +	return desc;
> > +}
> > +
> > +static void
> > +dla_free_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> > +{
> > +	uint64_t op_base;
> > +	uint64_t dep_graph_addr;
> > +	struct dla_task *task;
> > +
> > +	pr_debug("Enter: %s op desc index %u ROI %d\n", __func__,
> > +				op_desc->index, op_desc->roi_index);
> 
> Possible null pointer dereference
> 
> > +	task = engine->task;
> > +	dep_graph_addr = (sizeof(struct dla_common_op_desc) *
> > +				engine->network->num_operations *
> > +				op_desc->roi_index);
> > +
> > +	if (op_desc->index == -1)
> > +		goto exit;
> 
> Possible null pointer dereference
> 
> > +	if (op_desc == NULL)
> > +		goto exit;
> 
> Or this is unnecessary.
> 
> > +
> > +	/**
> > +	 * TODO: keeping the depth value hardcoded as 0 for now,
> > +	 * need to replace it once corresponding implementation is done.
> > +	 */
> > +	op_base = (dep_graph_addr +
> > +			(sizeof(struct dla_common_op_desc) *
> > +			(uint64_t)op_desc->index));
> > +
> > +	/**
> > +	 * Flush descriptor to DRAM
> > +	 */
> > +	dla_data_write(engine->driver_context,
> > +			task->task_data,
> > +			(void *)op_desc,
> > +			task->dependency_graph_addr,
> > +			sizeof(struct dla_common_op_desc),
> > +			op_base);
> > +
> > +	/**
> > +	 * Release it
> > +	 */
> > +	op_desc->index = -1;
> > +	op_desc->roi_index = -1;
> > +exit:
> > +	return;
> > +}
> > +
> > +void
> > +dla_put_op_desc(struct dla_engine *engine, struct dla_common_op_desc *op_desc)
> > +{
> > +	int32_t i;
> > +	struct dla_common_op_desc *desc;
> > +
> > +	if (op_desc == NULL)
> > +		return;
> > +
> > +	if (op_desc->index == -1)
> > +		return;
> > +
> > +	desc = &desc_cache[op_desc->op_type][0];
> > +
> > +	for (i = 0; i < DLA_OP_CACHE_SIZE; i++, desc++) {
> > +		if (desc->index == op_desc->index &&
> > +				desc->roi_index == op_desc->roi_index) {
> 
> Reverse if.
> 
> 		if (desc->index != op_desc->index)
> 			continue;
> 		if (desc->roi_index != op_desc->roi_index)
> 			continue;
> 
> > +
> > +			desc_refcount[op_desc->op_type][i]--;
> > +
> > +			/**
> > +			 * Free desc if refcount is 0
> > +			 */
> Pretty useless comment, and it certainly does not need three lines.
> 
> > +			if (desc_refcount[op_desc->op_type][i] == 0)
> > +				dla_free_op_desc(engine, op_desc);
> > +
> > +			return;
> > +		}
> > +	}
> > +}
> > +
> > +void
> > +dla_init_op_cache(struct dla_engine *engine)
> > +{
> > +	int32_t i, j;
> > +	struct dla_common_op_desc *desc = &desc_cache[0][0];
> > +
> > +	memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache));
> > +	memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount));
> > +
> > +	for (i = 0; i < DLA_OP_NUM; i++) {
> > +		for (j = 0; j < DLA_OP_CACHE_SIZE; j++) {
> > +			desc->index = -1;
> > +			desc->roi_index = -1;
> > +			desc->op_type = (uint8_t)i;
> > +			desc++;
> > +		}
> > +	}
> > +}
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_common.h b/drivers/gpu/drm/nvdla/nvdla_common.h
> > new file mode 100644
> > index 000000000000..38cf43246890
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_common.h
> > @@ -0,0 +1,835 @@
> 
> ... snip
> 
> > +struct dla_conv_op_desc {
> > +	/* Performance parameters */
> > +
> > +	/* dla_conv_mode */
> > +	uint8_t conv_mode;
> > +	uint8_t data_reuse;
> > +	uint8_t weight_reuse;
> > +	uint8_t skip_data_rls;
> > +
> > +	uint8_t skip_weight_rls;
> > +	uint8_t reserved0;
> > +	uint16_t entry_per_slice;
> > +
> > +	/* dla_data_format */
> > +	uint8_t data_format;
> > +	/* dla_pixel_mapping */
> > +	uint8_t pixel_mapping;
> > +	/* number of free slices before fetch */
> > +	uint16_t fetch_grain;
> > +
> > +	uint8_t reserved_b[8];
> > +
> > +	/* batch_num */
> > +	uint8_t batch;
> > +	/* dla_weight_format */
> > +	uint8_t weight_format;
> > +	uint8_t data_bank;
> > +	uint8_t weight_bank;
> > +
> > +	/* the offset in bytes of each data cube in a batch */
> > +	uint32_t batch_stride;
> > +
> > +	uint8_t post_extension;
> > +	uint8_t pixel_override;
> > +	/* number of slices need to be released */
> > +	uint16_t release;
> > +
> > +	 /* The input cube dimension for CSC */
> > +	uint16_t input_width_csc;
> > +	uint16_t input_height_csc;
> > +
> > +	uint16_t input_channel_csc;
> > +	uint16_t kernel_width_csc;
> > +
> > +	uint16_t kernel_height_csc;
> > +	uint16_t kernel_channel_csc;
> > +
> > +	/* The input cube dimension for CMAC */
> > +	uint16_t input_width_cmac;
> > +	uint16_t input_height_cmac;
> > +
> > +	/* actual size in bytes */
> > +	uint32_t bytes_per_kernel;
> > +
> > +	/* Algorithm parameters */
> > +
> > +	int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
> > +	int16_t mean_gu; /* mean value for green in RGB or U in YUV */
> > +
> > +	int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
> > +	int16_t mean_ax;
> > +
> > +	uint8_t mean_format; /* dla_mean_format */
> > +	uint8_t conv_stride_x;
> > +	uint8_t conv_stride_y;
> > +	uint8_t pad_x_left;
> > +
> > +	uint8_t pad_x_right;
> > +	uint8_t pad_y_top;
> > +	uint8_t pad_y_bottom;
> > +	uint8_t dilation_x;
> > +
> > +	uint8_t dilation_y;
> > +	uint8_t reserved2[2];
> > +
> > +	/* Precision parameters */
> > +	uint8_t pra_truncate;
> > +
> > +	uint8_t in_precision;
> > +	/* The output precision from CONV, it's the MAC processing precison */
> 
> ./nvdla_common.h:428: precison ==> precision
> 
> > +	uint8_t out_precision;
> > +	int16_t pad_val;
> > +
> > +	/* input converter parameters */
> > +	struct dla_cvt_param in_cvt;
> > +	/* output converter parameters, support truncate only */
> > +	struct dla_cvt_param out_cvt;
> > +
> > +} __packed __aligned(4);
> > +
> > +struct dla_conv_stat_desc {
> > +	uint32_t data_read_stall;
> > +	uint32_t weight_read_stall;
> > +	uint32_t data_read_latency;
> > +	uint32_t weight_read_latency;
> > +	uint32_t saturation_count;
> > +	uint32_t nan_data_num;
> > +	uint32_t nan_weight_num;
> > +	uint32_t inf_data_num;
> > +	uint32_t inf_weight_num;
> > +} __packed __aligned(4);
> > +
> > +/**
> > + * @ingroup SDP
> > + * @name Activation functions
> > + * @brief Activation functions supported in SDP
> > + * @{
> > + */
> > +#define ACTIVATION_NONE		0
> > +#define ACTIVATION_RELU		1
> > +#define ACTIVATION_LUT		2
> > +#define ACTIVATION_PRELU	3
> > +/** @} */
> > +
> > +/**
> > + * @ingroup LUT
> > + * @name LUT size
> > + * @brief LUT sizes for linear and exponentila LUT
> > + * @{
> > + */
> > +#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2		6
> > +#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2	8
> > +/** @} */
> > +
> > +/**
> > + * @ingroup LUT
> > + * @name LUT types
> > + * @brief DLA supports two types of LUT, linear and exonential
> > + * @{
> > + */
> > +#define LUT_LINEAR_EXP_TABLE		0
> > +#define LUT_LINEAR_ONLY_TABLE		1
> > +/** @} */
> > +
> > +/**
> > + * @ingroup LUT
> > + * @name LUT methods
> > + * @brief DLA supports two types of LUT, linear and exonential
> > + * @{
> > + */
> > +#define LUT_METHOD_EXPONENTIAL		0
> > +#define LUT_METHOD_LINEAR		1
> > +/** @} */
> > +
> > +/**
> > + * @ingroup LUT
> > + * @name LUT
> > + * @brief DLA supports two types of LUT, linear and exonential
> > + * @{
> > + */
> > +#define LUT_PRI_LINEAR_EXP		0
> > +#define LUT_PRI_LINEAR_ONLY		1
> > +/** @} */
> > +
> > +union dla_lut_offset {
> > +	/**
> > +	 * Number should be substracted on log domain before look up
> 
> ./nvdla_common.h:505: substracted ==> subtracted
> 
> > +	 * exponetial table it has the same definition as hardware
> 
> ./nvdla_common.h:506: exponetial ==> exponential
> 
> > +	 * thus input scaling should also take into account when
> > +	 * set this field.
> > +	 */
> > +	int8_t exp_offset;
> > +	/**
> > +	 * Number of bits should be right shift before looking
> > +	 * up linear table
> > +	 */
> > +	int8_t frac_bits;
> > +	uint16_t reserved0;
> > +};
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_drm.c b/drivers/gpu/drm/nvdla/nvdla_drm.c
> > new file mode 100644
> > index 000000000000..9217eee1de3b
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_drm.c
> > @@ -0,0 +1,695 @@
> > +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
> > +/*
> > + * Copyright (C) 2017-2018 NVIDIA CORPORATION
> > + * Copyright (C) 2022 Cai Huoqing
> > + */
> > +
> > +#include <linux/dma-buf.h>
> > +#include <linux/dma-mapping.h>
> > +#include <linux/fs.h>
> > +#include <linux/interrupt.h>
> > +#include <linux/irq.h>
> > +#include <linux/irqdomain.h>
> > +#include <linux/module.h>
> > +#include <linux/of.h>
> > +#include <linux/of_device.h>
> > +#include <linux/of_irq.h>
> > +#include <linux/of_platform.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/printk.h>
> > +#include <linux/slab.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/time.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/types.h>
> > +
> > +#include "nvdla_drm.h"
> > +#include "nvdla_ioctl.h"
> > +#include "nvdla_engine.h"
> > +
> > +static struct nvdla_config nvdla_config_os_initial = {
> > +	.atom_size = 32,
> > +	.bdma_enable = true,
> > +	.rubik_enable = true,
> > +	.weight_compress_support = true,
> > +};
> > +
> > +static struct nvdla_config nvdla_config_small = {
> > +	//.atom_size = 8,
> > +	.atom_size = 32,  // nv_large config
> > +	.bdma_enable = false,
> > +	.rubik_enable = false,
> > +	.weight_compress_support = false,
> > +};
> > +
> > +int64_t dla_get_time_us(void)
> 
> Function is never used.
> 
> > +{
> > +	return ktime_get_ns() / NSEC_PER_USEC;
> > +}
> > +
> > +void dla_reg_write(void *driver_context, uint32_t addr, uint32_t reg)
> > +{
> > +	struct nvdla_device *nvdla_dev =
> > +			(struct nvdla_device *)driver_context;
> > +
> > +	if (!nvdla_dev)
> > +		return;
> > +
> > +	writel(reg, nvdla_dev->base + addr);
> > +}
> > +
> > +uint32_t dla_reg_read(void *driver_context, uint32_t addr)
> > +{
> > +	struct nvdla_device *nvdla_dev =
> > +			(struct nvdla_device *)driver_context;
> > +
> > +	if (!nvdla_dev)
> > +		return 0;
> > +
> > +	return readl(nvdla_dev->base + addr);
> > +}
> > +
> > +static irqreturn_t nvdla_engine_isr(int32_t irq, void *data)
> > +{
> > +	unsigned long flags;
> > +	uint32_t mask;
> > +	uint32_t reg;
> > +	struct dla_processor *processor = NULL;
> > +	struct dla_processor_group *group;
> > +	struct dla_engine *engine;
> > +	struct nvdla_device *nvdla_dev = (struct nvdla_device *)data;
> > +
> > +	if (!nvdla_dev)
> > +		return IRQ_NONE;
> > +
> > +	engine = nvdla_dev->engine_context;
> > +	spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags);
> > +
> > +	mask = glb_reg_read(engine, S_INTR_MASK);
> 
> Never used. It would be nice to fix these so that static analyzers will not
> complain about them anymore, but it is your choice what you want to do.
Thanks for checking. This line reads a read-to-clear register in order to
clear the interrupt, so it is OK to leave it here.
As for the others (code style and typos), I will try to fix them.

Thanks,
Cai
> 
> > +	reg = glb_reg_read(engine, S_INTR_STATUS);
> > +
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_SDP];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_SDP];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_CDP];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_CDP];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_RUBIK];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_RUBIK];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_PDP];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_PDP];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_BDMA];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_BDMA];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_OP_COMPLETED);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_CDMA_DT_DONE);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[0];
> > +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> > +	}
> > +	if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) {
> > +		processor = &engine->processors[DLA_OP_CONV];
> > +		group = &processor->groups[1];
> > +		group->events |= (1 << DLA_EVENT_CDMA_WT_DONE);
> > +	}
> > +
> > +	glb_reg_write(engine, S_INTR_STATUS, reg);
> > +	mask = glb_reg_read(engine, S_INTR_MASK);
> 
> Never used
> 
> > +	reg = glb_reg_read(engine, S_INTR_STATUS);
> 
> Never used.
> 
> > +
> > +	complete(&nvdla_dev->event_notifier);
> > +	spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags);
> > +
> > +	return IRQ_HANDLED;
> > +}
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_gem.c b/drivers/gpu/drm/nvdla/nvdla_gem.c
> > new file mode 100644
> > index 000000000000..cccf6d01a564
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_gem.c
> 
> ... snip
> 
> > +static const struct drm_ioctl_desc nvdla_drm_ioctls[] = {
> > +	DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW),
> > +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW),
> > +	DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW),
> > +	/* use DRM_IOCTL_MODE_DESTROY_DUMB to destory */
> 
> ./nvdla_gem.c:347: destory ==> destroy
> 
> > +};
> 
> ... snip
> 
> > diff --git a/drivers/gpu/drm/nvdla/nvdla_scheduler.c b/drivers/gpu/drm/nvdla/nvdla_scheduler.c
> > new file mode 100644
> > index 000000000000..b814077478c6
> > --- /dev/null
> > +++ b/drivers/gpu/drm/nvdla/nvdla_scheduler.c
> 
> ... snip
> 
> > +static int
> > +dla_update_dependency(struct dla_engine *engine,
> > +					  struct dla_consumer *consumer,
> > +					  struct dla_common_op_desc *op_desc,
> > +					  uint8_t event, uint8_t roi_index)
> > +{
> > +	int32_t ret = 0;
> > +	struct dla_processor *processor;
> > +
> > +	if (consumer->index == -1)
> > +		goto exit;
> > +
> > +	/* Update dependency only if event matches */
> > +	if (event != consumer->event)
> > +		goto exit;
> > +
> > +	/**
> > +	 * If consumer index is valid but op desc is NULL means
> > +	 * op desc for consumer was not pre-fetched
> > +	 */
> > +	if (op_desc == NULL) {
> > +		ret = -EINVAL;
> > +		pr_err("Operation descriptor is NULL, consumer index %d",
> > +				consumer->index);
> > +		goto exit;
> > +	}
> > +
> > +	pr_debug("Update dependency operation index %d ROI %d DEP_COUNT=%d\n",
> > +					op_desc->index, op_desc->roi_index,
> > +					op_desc->dependency_count);
> > +	op_desc->dependency_count--;
> > +
> > +	if (op_desc->dependency_count == 0) {
> > +		processor = &engine->processors[op_desc->op_type];
> > +		pr_debug("enable %s in %s as depdency are resolved\n",
> 
> ./nvdla_scheduler.c:455: depdency ==> dependency
> 
> > +			processor->name, __func__);
> > +
> > +		ret = dla_enable_operation(engine, processor, op_desc);
> > +		if (ret)
> > +			goto exit;
> > +	}
> > +exit:
> > +	return ret;
> > +}
> 
> ... snip
> 
> > +int
> > +dla_process_events(struct dla_engine *engine, uint32_t *task_complete)
> > +{
> > +	int32_t i;
> > +	int32_t ret = 0;
> > +
> > +	for (i = 0; i < DLA_OP_NUM; i++) {
> > +		struct dla_processor *processor;
> > +
> > +		processor = &engine->processors[i];
> > +		ret = dla_handle_events(engine, processor);
> > +		/**
> > +		 * Incase engine status is non-zero, then don't
> 
> ./nvdla_scheduler.c:905: Incase ==> In case
> 
> > +		 * update the engine status. We should keep its
> > +		 * status for later cleaning of engine.
> > +		 */
> > +		if (!engine->status)
> > +			engine->status = ret;
> > +	}
> > +
> > +	if (engine->network->num_operations == engine->num_proc_hwl)
> > +		*task_complete = 1;
> > +
> > +	return ret;
> > +}
> 
> ... snip
> 
>   Argillander