The NVIDIA Deep Learning Accelerator (NVDLA) is an open-source IP that is
integrated into the NVIDIA Jetson AGX Xavier, so add driver support for this
accelerator.

Signed-off-by: Cai Huoqing <cai.huoqing@xxxxxxxxx>
---
v1->v2:
*Rename nvdla_drm.[ch] to nvdla_drv.[ch], rename nvdla_ioctl.h to nvdla_drm.h,
 and move it to uapi.
 comments link: https://lore.kernel.org/lkml/20bac605-97e6-e5cd-c4e4-83a8121645d8@xxxxxxx/
*Remove the nonexistent filename in the Makefile.
 comments link: https://lore.kernel.org/lkml/202204201512.pp20MXT5-lkp@xxxxxxxxx/
*Sort file names alphabetically in the Makefile.
*Rearrange the error messages, and use drm_err/_dbg() instead of pr_err/_dbg().
*Replace the "dla_" prefix with "nvdla_".
*Check the iosys_map with iosys_map_is_null(), and check "ret" directly.
*Use iosys_map_memcpy_to/_from() on the iosys_map instead of memcpy().
*Fix the wrong parameter in "dma_buf_vunmap(buf, ptr)": pass "&map" instead of "ptr".
*Use an iosys_map instead of a raw kvaddr and initialize it with
 iosys_map_set_vaddr() (see the usage sketch after the diffstat below).
*Update vm_pgoff with "vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node)",
 which is cleaner.
*Remove the unused nvdla_drm_gem_mmap and register drm_gem_mmap in
 file_operations directly.
*Use DEFINE_DRM_GEM_FOPS() to define nvdla_drm_fops.
*Remove the unused nvdla_drm_gem_mmap_buf and register drm_gem_prime_mmap in
 drm_driver directly.
 comments link: https://lore.kernel.org/lkml/7fa19996-5830-af3d-ab24-08c76e1d5604@xxxxxxx/
*Fix typos and some code style issues.
*Remove the unused function nvdla_get_time_us().
 comments link: https://lore.kernel.org/lkml/0fa9ab41-c18e-a569-e6fe-a0e9d965905e@xxxxxxxxxxxxxxxxxxxx/

 drivers/gpu/drm/Kconfig                 |    2 +
 drivers/gpu/drm/Makefile                |    1 +
 drivers/gpu/drm/nvdla/Kconfig           |    8 +
 drivers/gpu/drm/nvdla/Makefile          |   17 +
 drivers/gpu/drm/nvdla/nvdla_bdma.c      |  198 +++++
 drivers/gpu/drm/nvdla/nvdla_cache.c     |  202 +++++
 drivers/gpu/drm/nvdla/nvdla_cdp.c       |  299 +++++++
 drivers/gpu/drm/nvdla/nvdla_common.c    |  293 +++++++
 drivers/gpu/drm/nvdla/nvdla_common.h    |  835 +++++++++++++++++++
 drivers/gpu/drm/nvdla/nvdla_conv.c      |  684 +++++++++++++++
 drivers/gpu/drm/nvdla/nvdla_drv.c       |  694 ++++++++++++++++
 drivers/gpu/drm/nvdla/nvdla_drv.h       |  129 +++
 drivers/gpu/drm/nvdla/nvdla_engine.c    |  233 ++++++
 drivers/gpu/drm/nvdla/nvdla_engine.h    |  272 ++++++
 drivers/gpu/drm/nvdla/nvdla_gem.c       |  358 ++++++++
 drivers/gpu/drm/nvdla/nvdla_pdp.c       |  448 ++++++++++
 drivers/gpu/drm/nvdla/nvdla_rubik.c     |  214 +++++
 drivers/gpu/drm/nvdla/nvdla_sched.h     |   37 +
 drivers/gpu/drm/nvdla/nvdla_scheduler.c | 1012 +++++++++++++++++++++++
 drivers/gpu/drm/nvdla/nvdla_sdp.c       |  723 ++++++++++++++++
 20 files changed, 6659 insertions(+)
 create mode 100644 drivers/gpu/drm/nvdla/Kconfig
 create mode 100644 drivers/gpu/drm/nvdla/Makefile
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_bdma.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_cache.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_cdp.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_common.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_common.h
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_conv.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_drv.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_drv.h
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_engine.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_engine.h
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_gem.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_pdp.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_rubik.c
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_sched.h
 create mode 100644 drivers/gpu/drm/nvdla/nvdla_scheduler.c
 create mode 100644
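
A rough illustration of the iosys_map handling the changelog items above
describe (a minimal sketch for review context only, not code from this patch;
the helper name, parameters, and error handling are made up):

    #include <linux/dma-buf.h>
    #include <linux/iosys-map.h>

    /* Copy len bytes at offset out of a dma-buf using the iosys_map helpers. */
    static int example_read_dmabuf(struct dma_buf *buf, void *dst,
                                   size_t offset, size_t len)
    {
            struct iosys_map map;   /* replaces a raw kvaddr pointer */
            int ret;

            ret = dma_buf_vmap(buf, &map);
            if (ret)                /* check "ret" directly */
                    return ret;

            /* defensive check; a valid mapping is expected after vmap */
            if (iosys_map_is_null(&map)) {
                    dma_buf_vunmap(buf, &map);      /* pass "&map", not a raw pointer */
                    return -ENOMEM;
            }

            /* iosys_map copy helper instead of a plain memcpy() */
            iosys_map_memcpy_from(dst, &map, offset, len);

            dma_buf_vunmap(buf, &map);
            return 0;
    }

For a GEM object backed by a kernel virtual address the map would instead be
initialized with iosys_map_set_vaddr(&map, vaddr), and the file_operations and
drm_driver hooks come straight from the DRM helpers via DEFINE_DRM_GEM_FOPS()
and drm_gem_prime_mmap, as noted in the changelog.
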
drivers/gpu/drm/nvdla/nvdla_sdp.c diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5133c3f028ab..a55cff374abd 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -409,6 +409,8 @@ source "drivers/gpu/drm/solomon/Kconfig" source "drivers/gpu/drm/sprd/Kconfig" +source "drivers/gpu/drm/nvdla/Kconfig" + config DRM_HYPERV tristate "DRM Support for Hyper-V synthetic video device" depends on DRM && PCI && MMU && HYPERV diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index c2ef5f9fce54..8fa3537f308a 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -134,3 +134,4 @@ obj-y += gud/ obj-$(CONFIG_DRM_HYPERV) += hyperv/ obj-y += solomon/ obj-$(CONFIG_DRM_SPRD) += sprd/ +obj-$(CONFIG_DRM_NVDLA) += nvdla/ diff --git a/drivers/gpu/drm/nvdla/Kconfig b/drivers/gpu/drm/nvdla/Kconfig new file mode 100644 index 000000000000..11c04f5da877 --- /dev/null +++ b/drivers/gpu/drm/nvdla/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DRM_NVDLA + tristate "NVDLA DRM" + depends on DRM + select DRM_GEM_CMA_HELPER + help + Choose this option for open-source NVIDIA DLA support. + If M is selected the module will be called nvdla-drm. diff --git a/drivers/gpu/drm/nvdla/Makefile b/drivers/gpu/drm/nvdla/Makefile new file mode 100644 index 000000000000..0eae453558a8 --- /dev/null +++ b/drivers/gpu/drm/nvdla/Makefile @@ -0,0 +1,17 @@ + +# SPDX-License-Identifier: GPL-2.0 +nvdla-drm-y := \ + nvdla_bdma.o \ + nvdla_cache.o \ + nvdla_cdp.o \ + nvdla_common.o \ + nvdla_conv.o \ + nvdla_drv.o \ + nvdla_engine.o \ + nvdla_gem.o \ + nvdla_pdp.o \ + nvdla_rubik.o \ + nvdla_scheduler.o \ + nvdla_sdp.o \ + +obj-$(CONFIG_DRM_NVDLA) += nvdla-drm.o diff --git a/drivers/gpu/drm/nvdla/nvdla_bdma.c b/drivers/gpu/drm/nvdla/nvdla_bdma.c new file mode 100644 index 000000000000..c9624d356090 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_bdma.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +static const uint8_t map_mem[] = { + FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, MC), + FIELD_ENUM(BDMA_CFG_CMD_0, SRC_RAM_TYPE, CVSRAM), +}; + +void +nvdla_bdma_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id) +{ + /** + * There is no producer bit for BDMA operation, + * interrupt pointer decides which outstanding request + * to use for this BDMA operation + */ +} + +int +nvdla_bdma_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + if (group->surface_desc->bdma_surface.num_transfers == (uint16_t)0) { + group->events |= ((uint8_t)1 << NVDLA_EVENT_OP_COMPLETED); + goto exit; + } + + /* Launch BDMA transfer */ + if (group->id == 0) + bdma_reg_write(engine, CFG_LAUNCH0, FIELD_ENUM(BDMA_CFG_LAUNCH0_0, + GRP0_LAUNCH, YES)); + else + bdma_reg_write(engine, CFG_LAUNCH1, FIELD_ENUM(BDMA_CFG_LAUNCH1_0, + GRP1_LAUNCH, YES)); + +exit: + return 0; +} + +void +nvdla_bdma_rdma_check(struct nvdla_processor_group *group) +{ + group->is_rdma_needed = 0; +} + +/** + * Program BDMA slot for transfer + */ +static int32_t +processor_bdma_program_slot(struct nvdla_engine *engine, + struct nvdla_bdma_surface_desc *bdma_surface, + struct nvdla_bdma_transfer_desc *transfer) +{ + int32_t ret = 0; + uint64_t source_addr = 0; + uint64_t destination_addr = 0; + uint32_t high, low, reg; + uint8_t bdma_free_slots = 0; 
+ + /* make sure there're enough free slots */ + do { + reg = bdma_reg_read(engine, STATUS); + reg = (reg & MASK(BDMA_STATUS_0, FREE_SLOT)) >> + SHIFT(BDMA_STATUS_0, FREE_SLOT); + } while (reg == 0); + bdma_free_slots = (uint8_t)reg; + + nvdla_get_dma_address(engine->driver_context, engine->task->task_data, + transfer->source_address, + (void *)&source_addr, + DESTINATION_DMA); + nvdla_get_dma_address(engine->driver_context, engine->task->task_data, + transfer->destination_address, + (void *)&destination_addr, + DESTINATION_DMA); + + ASSERT_GOTO((transfer->line_repeat <= 8192), + ret, -EINVAL, exit); + ASSERT_GOTO((transfer->surface_repeat <= 8192), + ret, -EINVAL, exit); + ASSERT_GOTO((transfer->line_size % 32) == 0, + ret, -EINVAL, exit); + ASSERT_GOTO(transfer->source_line >= transfer->line_size, + ret, -EINVAL, exit); + ASSERT_GOTO(transfer->destination_line >= transfer->line_size, + ret, -EINVAL, exit); + ASSERT_GOTO(transfer->source_surface >= + (transfer->source_line * transfer->line_repeat), + ret, -EINVAL, exit); + ASSERT_GOTO(transfer->destination_surface >= + (transfer->destination_line * transfer->line_repeat), + ret, -EINVAL, exit); + + /* config registers */ + high = upper_32_bits(source_addr); + low = lower_32_bits(source_addr); + bdma_reg_write(engine, CFG_SRC_ADDR_LOW, low); + bdma_reg_write(engine, CFG_SRC_ADDR_HIGH, high); + high = upper_32_bits(destination_addr); + low = lower_32_bits(destination_addr); + bdma_reg_write(engine, CFG_DST_ADDR_LOW, low); + bdma_reg_write(engine, CFG_DST_ADDR_HIGH, high); + bdma_reg_write(engine, CFG_LINE, (transfer->line_size >> 5) - 1); + reg = (map_mem[bdma_surface->source_type] << + SHIFT(BDMA_CFG_CMD_0, SRC_RAM_TYPE)) | + (map_mem[bdma_surface->destination_type] << + SHIFT(BDMA_CFG_CMD_0, DST_RAM_TYPE)); + bdma_reg_write(engine, CFG_CMD, reg); + bdma_reg_write(engine, CFG_LINE_REPEAT, transfer->line_repeat - 1); + bdma_reg_write(engine, CFG_SRC_LINE, transfer->source_line); + bdma_reg_write(engine, CFG_DST_LINE, transfer->destination_line); + bdma_reg_write(engine, CFG_SURF_REPEAT, transfer->surface_repeat - 1); + bdma_reg_write(engine, CFG_SRC_SURF, transfer->source_surface); + bdma_reg_write(engine, CFG_DST_SURF, transfer->destination_surface); + bdma_reg_write(engine, CFG_OP, FIELD_ENUM(BDMA_CFG_OP_0, EN, ENABLE)); + +exit: + return ret; +} + +int +nvdla_bdma_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + struct nvdla_processor_group *next_group; + + next_group = &processor->groups[!group->id]; + + /** + * If another group is already programmed but not active then + * do not program this operation as BDMA does not really + * have shadow copies for groups. It will end programming + * same group. Wait for another group to get enabled. 
+ */ + if ((processor->group_status & (1 << next_group->id)) && + !next_group->active) + return 0; + + return 1; +} + +void +nvdla_bdma_dump_config(struct nvdla_processor_group *group) +{ + struct nvdla_bdma_op_desc *bdma_op; + struct nvdla_bdma_surface_desc *bdma_surface; + + bdma_surface = &group->surface_desc->bdma_surface; + bdma_op = &group->operation_desc->bdma_op; +} + +int +nvdla_bdma_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t i; + int32_t ret = 0; + struct nvdla_bdma_surface_desc *bdma_surface; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (!engine->config_data->bdma_enable) { + drm_err(nvdla_dev->drm, "BDMA is not supported for this configuration\n"); + ret = -EINVAL; + goto exit; + } + + bdma_surface = &group->surface_desc->bdma_surface; + + drm_dbg(nvdla_dev->drm, "Num of transfers %u\n", bdma_surface->num_transfers); + if (bdma_surface->num_transfers == (uint16_t)0) + goto exit; + + if (bdma_surface->num_transfers > NUM_MAX_BDMA_OPS) { + drm_err(nvdla_dev->drm, "Invalid number of transfers\n"); + ret = -EINVAL; + goto exit; + } + + for (i = 0; i < bdma_surface->num_transfers; i++) { + ret = processor_bdma_program_slot(engine, bdma_surface, + &bdma_surface->transfers[i]); + if (ret) + goto exit; + } + + nvdla_enable_intr(engine, MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK1) | + MASK(GLB_S_INTR_MASK_0, BDMA_DONE_MASK0)); + +exit: + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_cache.c b/drivers/gpu/drm/nvdla/nvdla_cache.c new file mode 100644 index 000000000000..3c93c0150727 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_cache.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +#define NVDLA_OP_CACHE_SIZE (NVDLA_NUM_GROUPS * ((NVDLA_OP_NUM + 2) * 2)) + +static struct nvdla_common_op_desc desc_cache[NVDLA_OP_NUM][NVDLA_OP_CACHE_SIZE]; +static int32_t desc_refcount[NVDLA_OP_NUM][NVDLA_OP_CACHE_SIZE]; + +void +nvdla_get_refcount(struct nvdla_common_op_desc *op_desc) +{ + int32_t i; + struct nvdla_common_op_desc *desc = NULL; + + if (op_desc == NULL) + return; + + if (op_desc->index == -1) + return; + + desc = &desc_cache[op_desc->op_type][0]; + + for (i = 0; i < NVDLA_OP_CACHE_SIZE; i++, desc++) { + if (desc->index == op_desc->index && + desc->roi_index == op_desc->roi_index) { + desc_refcount[op_desc->op_type][i]++; + return; + } + } +} + +struct nvdla_common_op_desc * +nvdla_get_op_desc(struct nvdla_engine *engine, + struct nvdla_task_desc *task, int16_t index, + uint8_t op_type, uint8_t roi_index) +{ + int32_t i; + int32_t ret; + uint64_t op_base; + uint64_t dep_graph_addr; + struct nvdla_common_op_desc *desc = NULL; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (index == -1) { + drm_dbg(nvdla_dev->drm, "no desc get due to index==-1\n"); + goto exit; + } + + dep_graph_addr = (sizeof(struct nvdla_common_op_desc) * + engine->network->num_operations * roi_index); + + desc = &desc_cache[op_type][0]; + + for (i = 0; i < NVDLA_OP_CACHE_SIZE; i++, desc++) { + if (desc->index == index && desc->roi_index == roi_index) { + if (desc->op_type != op_type) { + drm_err(nvdla_dev->drm, + "op_cache[op=%u] contains incorrect entry of op[%u]\n", + op_type, desc->op_type); + continue; + } + desc_refcount[op_type][i]++; + goto exit; + } + } + + 
desc = &desc_cache[op_type][0]; + + for (i = 0; i < NVDLA_OP_CACHE_SIZE; i++, desc++) { + if (desc->index == -1) { + op_base = dep_graph_addr + + (sizeof(struct nvdla_common_op_desc) * + (uint64_t)index); + ret = nvdla_data_read(engine->driver_context, + task->task_data, + task->dependency_graph_addr, + (void *)(desc), + sizeof(struct nvdla_common_op_desc), + op_base); + if (ret) { + desc = NULL; + goto exit; + } + + if (op_type != desc->op_type) { + /* + * op_type of entry read from DRAM should not + * mismatch with given op_type. If they + * mismatches, then wrong entry is fetched, so + * report this issue by throwing error. + */ + drm_err(nvdla_dev->drm, + "Fetched [op_type=%u] from DRAM doesn't match with op_type[%u]\n", + desc->op_type, op_type); + desc->op_type = op_type; + desc->index = -1; + desc->roi_index = -1; + desc = NULL; + goto exit; + } + + desc->index = index; + desc->roi_index = roi_index; + + desc_refcount[op_type][i]++; + goto exit; + } + } + +exit: + return desc; +} + +static void +nvdla_free_op_desc(struct nvdla_engine *engine, struct nvdla_common_op_desc *op_desc) +{ + uint64_t op_base; + uint64_t dep_graph_addr; + struct nvdla_task_desc *task; + + task = engine->task; + dep_graph_addr = (sizeof(struct nvdla_common_op_desc) * + engine->network->num_operations * + op_desc->roi_index); + + /** + * TODO: keeping the depth value hardcoded as 0 for now, + * need to replace it once corresponding implementation is done. + */ + op_base = (dep_graph_addr + + (sizeof(struct nvdla_common_op_desc) * + (uint64_t)op_desc->index)); + + /* Flush descriptor to DRAM */ + nvdla_data_write(engine->driver_context, + task->task_data, + (void *)op_desc, + task->dependency_graph_addr, + sizeof(struct nvdla_common_op_desc), + op_base); + + /* Release it */ + op_desc->index = -1; + op_desc->roi_index = -1; +} + +void +nvdla_put_op_desc(struct nvdla_engine *engine, struct nvdla_common_op_desc *op_desc) +{ + int32_t i; + struct nvdla_common_op_desc *desc; + + if (op_desc == NULL) + return; + + if (op_desc->index == -1) + return; + + desc = &desc_cache[op_desc->op_type][0]; + + for (i = 0; i < NVDLA_OP_CACHE_SIZE; i++, desc++) { + if (desc->index == op_desc->index && + desc->roi_index == op_desc->roi_index) { + + desc_refcount[op_desc->op_type][i]--; + + /* Free desc if refcount is 0 */ + if (desc_refcount[op_desc->op_type][i] == 0) + nvdla_free_op_desc(engine, op_desc); + + return; + } + } +} + +void +nvdla_init_op_cache(struct nvdla_engine *engine) +{ + int32_t i, j; + struct nvdla_common_op_desc *desc = &desc_cache[0][0]; + + memset((uint8_t *)&desc_cache[0][0], 0, sizeof(desc_cache)); + memset((uint8_t *)&desc_refcount[0][0], 0, sizeof(desc_refcount)); + + for (i = 0; i < NVDLA_OP_NUM; i++) { + for (j = 0; j < NVDLA_OP_CACHE_SIZE; j++) { + desc->index = -1; + desc->roi_index = -1; + desc->op_type = (uint8_t)i; + desc++; + } + } +} diff --git a/drivers/gpu/drm/nvdla/nvdla_cdp.c b/drivers/gpu/drm/nvdla/nvdla_cdp.c new file mode 100644 index 000000000000..41a6cace48a1 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_cdp.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +static const uint8_t map_ram[] = { + FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, MC), + FIELD_ENUM(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE, CV), +}; + +static const uint8_t map_precision[] = { + 
FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT8), + FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, INT16), + FIELD_ENUM(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA, FP16), +}; + +static const uint8_t map_perf_dma[] = { + FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, DISABLE), + FIELD_ENUM(CDP_D_PERF_ENABLE_0, DMA_EN, ENABLE), +}; + +static const uint8_t map_perf_lut[] = { + FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, DISABLE), + FIELD_ENUM(CDP_D_PERF_ENABLE_0, LUT_EN, ENABLE), +}; + + +static uint32_t +map_local_size(uint8_t local_size) +{ + return ((local_size-1)/2)-1; +} + +void +nvdla_cdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id) +{ + uint32_t reg; + + /** + * set producer pointer for all sub-modules + */ + reg = group_id << SHIFT(CDP_S_POINTER_0, PRODUCER); + cdp_reg_write(engine, S_POINTER, reg); + reg = group_id << SHIFT(CDP_RDMA_S_POINTER_0, PRODUCER); + cdp_rdma_reg_write(engine, S_POINTER, reg); +} + +int +nvdla_cdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + uint32_t reg; + + reg = FIELD_ENUM(CDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE); + cdp_rdma_reg_write(engine, D_OP_ENABLE, reg); + reg = FIELD_ENUM(CDP_D_OP_ENABLE_0, OP_EN, ENABLE); + cdp_reg_write(engine, D_OP_ENABLE, reg); + + return 0; +} + +void +nvdla_cdp_rdma_check(struct nvdla_processor_group *group) +{ + group->is_rdma_needed = 1; +} + +static int32_t +processor_cdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret = 0; + uint32_t reg, high, low; + uint64_t input_address = 0; + uint64_t output_address = 0; + struct nvdla_lut_param lut; + struct nvdla_cdp_op_desc *cdp_op; + struct nvdla_cdp_surface_desc *cdp_surface; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + cdp_op = &group->operation_desc->cdp_op; + cdp_surface = &group->surface_desc->cdp_surface; + + /* Argument check */ + if (cdp_surface->src_data.type == NVDLA_MEM_HW) { + drm_err(nvdla_dev->drm, "Invalid source memory type\n"); + ret = -EINVAL; + goto exit; + } + if (cdp_surface->dst_data.type == NVDLA_MEM_HW) { + drm_err(nvdla_dev->drm, "Invalid destination memory type\n"); + ret = -EINVAL; + goto exit; + } + + if (cdp_op->in_precision != cdp_op->out_precision) { + drm_err(nvdla_dev->drm, "CDP does not support precision conversion\n"); + ret = -EINVAL; + goto exit; + } + + /* get the addresses from task descriptor */ + ret = nvdla_read_input_address(engine, &cdp_surface->src_data, + &input_address, + group->op_desc->index, + group->roi_index, 1); + if (ret) + goto exit; + + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + cdp_surface->dst_data.address, + cdp_surface->dst_data.offset, + (void *)&output_address, + DESTINATION_DMA); + if (cdp_op->lut_index >= 0) { + group->lut_index = cdp_op->lut_index; + nvdla_read_lut(engine, cdp_op->lut_index, (void *)&lut); + } + + /* config CDP RDMA registers */ + reg = ((cdp_surface->src_data.width - 1) + << SHIFT(CDP_RDMA_D_DATA_CUBE_WIDTH_0, WIDTH)); + cdp_rdma_reg_write(engine, D_DATA_CUBE_WIDTH, reg); + + reg = ((cdp_surface->src_data.height - 1) + << SHIFT(CDP_RDMA_D_DATA_CUBE_HEIGHT_0, HEIGHT)); + cdp_rdma_reg_write(engine, D_DATA_CUBE_HEIGHT, reg); + + reg = ((cdp_surface->src_data.channel - 1) + << SHIFT(CDP_RDMA_D_DATA_CUBE_CHANNEL_0, CHANNEL)); + cdp_rdma_reg_write(engine, D_DATA_CUBE_CHANNEL, reg); + + high = upper_32_bits(input_address); + low = lower_32_bits(input_address); + cdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_LOW, low); + 
cdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_HIGH, high); + + cdp_rdma_reg_write(engine, D_SRC_LINE_STRIDE, + cdp_surface->src_data.line_stride); + cdp_rdma_reg_write(engine, D_SRC_SURFACE_STRIDE, + cdp_surface->src_data.surf_stride); + + reg = (map_ram[cdp_surface->src_data.type] + << SHIFT(CDP_RDMA_D_SRC_DMA_CFG_0, SRC_RAM_TYPE)); + cdp_rdma_reg_write(engine, D_SRC_DMA_CFG, reg); + + reg = (map_precision[cdp_op->in_precision] + << SHIFT(CDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA)); + cdp_rdma_reg_write(engine, D_DATA_FORMAT, reg); + + /* config CDP */ + if (cdp_op->lut_index >= 0) + update_lut(engine, CDP_S_LUT_ACCESS_CFG_0, &lut, cdp_op->in_precision); + + high = upper_32_bits(output_address); + low = lower_32_bits(output_address); + cdp_reg_write(engine, D_DST_BASE_ADDR_LOW, low); + cdp_reg_write(engine, D_DST_BASE_ADDR_HIGH, high); + + cdp_reg_write(engine, D_DST_LINE_STRIDE, cdp_surface->dst_data.line_stride); + cdp_reg_write(engine, D_DST_SURFACE_STRIDE, cdp_surface->dst_data.surf_stride); + + reg = (map_ram[cdp_surface->dst_data.type] + << SHIFT(CDP_D_DST_DMA_CFG_0, DST_RAM_TYPE)); + cdp_reg_write(engine, D_DST_DMA_CFG, reg); + + reg = (map_precision[cdp_op->in_precision] + << SHIFT(CDP_D_DATA_FORMAT_0, INPUT_DATA_TYPE)); + cdp_reg_write(engine, D_DATA_FORMAT, reg); + + reg = (map_local_size(cdp_op->local_size) + << SHIFT(CDP_D_LRN_CFG_0, NORMALZ_LEN)); + cdp_reg_write(engine, D_LRN_CFG, reg); + + reg = (cdp_op->in_cvt.offset + << SHIFT(CDP_D_DATIN_OFFSET_0, DATIN_OFFSET)); + cdp_reg_write(engine, D_DATIN_OFFSET, reg); + + reg = (cdp_op->in_cvt.scale + << SHIFT(CDP_D_DATIN_SCALE_0, DATIN_SCALE)); + cdp_reg_write(engine, D_DATIN_SCALE, reg); + + reg = (cdp_op->in_cvt.truncate + << SHIFT(CDP_D_DATIN_SHIFTER_0, DATIN_SHIFTER)); + cdp_reg_write(engine, D_DATIN_SHIFTER, reg); + + reg = (cdp_op->out_cvt.offset + << SHIFT(CDP_D_DATOUT_OFFSET_0, DATOUT_OFFSET)); + cdp_reg_write(engine, D_DATOUT_OFFSET, reg); + + reg = (cdp_op->out_cvt.scale + << SHIFT(CDP_D_DATOUT_SCALE_0, DATOUT_SCALE)); + cdp_reg_write(engine, D_DATOUT_SCALE, reg); + + reg = (cdp_op->out_cvt.truncate + << SHIFT(CDP_D_DATOUT_SHIFTER_0, DATOUT_SHIFTER)); + cdp_reg_write(engine, D_DATOUT_SHIFTER, reg); + + reg = ((cdp_op->bypass_sqsum ? + FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, ENABLE) : + FIELD_ENUM(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS, DISABLE)) << + SHIFT(CDP_D_FUNC_BYPASS_0, SQSUM_BYPASS)) | + ((cdp_op->bypass_out_mul ? + FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, ENABLE) : + FIELD_ENUM(CDP_D_FUNC_BYPASS_0, MUL_BYPASS, DISABLE)) << + SHIFT(CDP_D_FUNC_BYPASS_0, MUL_BYPASS)); + cdp_reg_write(engine, D_FUNC_BYPASS, reg); + +exit: + return ret; +} + +int +nvdla_cdp_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + struct nvdla_processor_group *next_group; + struct nvdla_cdp_op_desc *cdp_op; + + cdp_op = &group->operation_desc->cdp_op; + next_group = &processor->groups[!group->id]; + + /** + * Single LUT is shared between two CDP groups, need to make + * sure that usage does not conflict. Also, LUT write + * access is locked when CDP sub-engine is active, so delay + * writing LUT when another group is active. + */ + + /** + * if no LUT required for current group then it can be programmed + * without further checks + */ + if (cdp_op->lut_index == -1) + return 1; + + /** + * if same LUT is used for both groups then it can be programmed + * without more checks. Even if another group is active and LUT + * is locked, it would have been programmed by another group. 
+ */ + if (next_group->lut_index == cdp_op->lut_index) + return 1; + + /** + * if LUT index of another group is not -1 means some LUT is programmed, + * then do not program current LUT as we already know current LUT is not + * -1 and neither same as another group. + */ + if (next_group->lut_index != -1) + return 0; + + /** + * if current group needs LUT different than another group and that + * group is not active then program it. + */ + if (!next_group->active) + return 1; + + /** + * if control is here it means current group is using LUT different than + * another group and that group is active. Wait for another group to + * become idle. + */ + + return 0; +} + +void +nvdla_cdp_dump_config(struct nvdla_processor_group *group) +{ + struct nvdla_cdp_op_desc *cdp_op; + struct nvdla_cdp_surface_desc *cdp_surface; + + cdp_surface = &group->surface_desc->cdp_surface; + cdp_op = &group->operation_desc->cdp_op; +} + +int +nvdla_cdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret; + + nvdla_enable_intr(engine, MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK1) | + MASK(GLB_S_INTR_MASK_0, CDP_DONE_MASK0)); + + ret = processor_cdp_program(engine, group); + if (ret) + goto exit; + +exit: + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_common.c b/drivers/gpu/drm/nvdla/nvdla_common.c new file mode 100644 index 000000000000..7606f9f33b6f --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_common.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_reg.h" +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_engine.h" + +static const uint8_t map_lut_method[] = { + FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, EXPONENT), + FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION, LINEAR) +}; +static const uint8_t map_lut_out[] = { + FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LE), + FIELD_ENUM(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY, LO) +}; + +static const uint16_t access_data_offset[] = { + CDP_S_LUT_ACCESS_DATA_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_ACCESS_DATA_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t lut_cfg_offset[] = { + CDP_S_LUT_CFG_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_CFG_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t lut_info_offset[] = { + CDP_S_LUT_INFO_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_INFO_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t le_start_offset[] = { + CDP_S_LUT_LE_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LE_START_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t le_end_offset[] = { + CDP_S_LUT_LE_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LE_END_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t lo_start_offset[] = { + CDP_S_LUT_LO_START_LOW_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LO_START_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t lo_end_offset[] = { + CDP_S_LUT_LO_END_LOW_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LO_END_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t le_slope_scale_offset[] = { + CDP_S_LUT_LE_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LE_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t le_slope_shift_offset[] = { + CDP_S_LUT_LE_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LE_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static const uint16_t lo_slope_scale_offset[] = { + CDP_S_LUT_LO_SLOPE_SCALE_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LO_SLOPE_SCALE_0 - SDP_S_LUT_ACCESS_CFG_0, +}; +static 
const uint16_t lo_slope_shift_offset[] = { + CDP_S_LUT_LO_SLOPE_SHIFT_0 - CDP_S_LUT_ACCESS_CFG_0, + SDP_S_LUT_LO_SLOPE_SHIFT_0 - SDP_S_LUT_ACCESS_CFG_0, +}; + +void update_lut(struct nvdla_engine *engine, uint32_t reg_base, + struct nvdla_lut_param *lut, uint8_t precision) +{ + int32_t i; + uint32_t reg; + uint32_t high, low; + int32_t is_sdp = reg_base == SDP_S_LUT_ACCESS_CFG_0; + + /* program raw table */ + reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LE) + << SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) | + (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE) + << SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE)); + reg_write(engine, reg_base, reg); + + for (i = 0; i < (1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1; i++) { + nvdla_reg_write(engine->driver_context, + reg_base + access_data_offset[is_sdp], + lut->linear_exp_table[i]); + } + + /* program density table */ + reg = (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID, LO) + << SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_TABLE_ID)) | + (FIELD_ENUM(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE, WRITE) + << SHIFT(CDP_S_LUT_ACCESS_CFG_0, LUT_ACCESS_TYPE)); + nvdla_reg_write(engine->driver_context, reg_base, reg); + + for (i = 0; i < (1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1; i++) { + nvdla_reg_write(engine->driver_context, + reg_base + access_data_offset[is_sdp], + lut->linear_only_table[i]); + } + + /* program other configurations */ + reg = (map_lut_method[lut->method] << + SHIFT(CDP_S_LUT_CFG_0, LUT_LE_FUNCTION)) | + (map_lut_out[lut->hybrid_priority] << + SHIFT(CDP_S_LUT_CFG_0, LUT_HYBRID_PRIORITY)) | + (map_lut_out[lut->underflow_priority] << + SHIFT(CDP_S_LUT_CFG_0, LUT_UFLOW_PRIORITY)) | + (map_lut_out[lut->overflow_priority] << + SHIFT(CDP_S_LUT_CFG_0, LUT_OFLOW_PRIORITY)); + nvdla_reg_write(engine->driver_context, + reg_base + lut_cfg_offset[is_sdp], reg); + + if (lut->method == FIELD_ENUM(CDP_S_LUT_CFG_0, + LUT_LE_FUNCTION, EXPONENT)) { + reg = ((((uint32_t)lut->linear_exp_offset.exp_offset) << + SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET))& + MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_OFFSET)) | + ((((uint32_t)lut->linear_only_offset.frac_bits) << + SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))& + MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT)); + } else { + reg = ((((uint32_t)lut->linear_exp_offset.frac_bits) << + SHIFT(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT))& + MASK(CDP_S_LUT_INFO_0, LUT_LE_INDEX_SELECT)) | + ((((uint32_t)lut->linear_only_offset.frac_bits) << + SHIFT(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT))& + MASK(CDP_S_LUT_INFO_0, LUT_LO_INDEX_SELECT)); + } + nvdla_reg_write(engine->driver_context, + reg_base + lut_info_offset[is_sdp], reg); + high = upper_32_bits(lut->linear_exp_start); + low = lower_32_bits(lut->linear_exp_start); + nvdla_reg_write(engine->driver_context, + reg_base + le_start_offset[is_sdp], low); + if (!is_sdp) + nvdla_reg_write(engine->driver_context, + reg_base + le_start_offset[is_sdp] + 4, high); + + high = upper_32_bits(lut->linear_exp_end); + low = lower_32_bits(lut->linear_exp_end); + nvdla_reg_write(engine->driver_context, + reg_base + le_end_offset[is_sdp], low); + if (!is_sdp) + nvdla_reg_write(engine->driver_context, + reg_base + le_end_offset[is_sdp] + 4, high); + + high = upper_32_bits(lut->linear_only_start); + low = lower_32_bits(lut->linear_only_start); + nvdla_reg_write(engine->driver_context, + reg_base + lo_start_offset[is_sdp], low); + if (!is_sdp) + nvdla_reg_write(engine->driver_context, + reg_base + lo_start_offset[is_sdp] + 4, high); + + high = upper_32_bits(lut->linear_only_end); + low = 
lower_32_bits(lut->linear_only_end); + nvdla_reg_write(engine->driver_context, + reg_base + lo_end_offset[is_sdp], low); + if (!is_sdp) + nvdla_reg_write(engine->driver_context, + reg_base + lo_end_offset[is_sdp] + 4, high); + + if (precision == PRECISION_FP16) { + reg = (lut->linear_exp_underflow_slope.data_f << + SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_UFLOW_SCALE)) | + (lut->linear_exp_overflow_slope.data_f << + SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_OFLOW_SCALE)); + nvdla_reg_write(engine->driver_context, + reg_base + le_slope_scale_offset[is_sdp], reg); + + reg = (lut->linear_only_underflow_slope.data_f << + SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_UFLOW_SCALE)) | + (lut->linear_only_overflow_slope.data_f << + SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_OFLOW_SCALE)); + nvdla_reg_write(engine->driver_context, + reg_base + lo_slope_scale_offset[is_sdp], reg); + } else { + union nvdla_slope *oslope; + union nvdla_slope *uslope; + + uslope = &lut->linear_exp_underflow_slope; + oslope = &lut->linear_exp_overflow_slope; + reg = ((((uint32_t)uslope->data_i.scale) + << SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_UFLOW_SCALE))& + MASK(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_UFLOW_SCALE)) | + ((((uint32_t)oslope->data_i.scale) + << SHIFT(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_OFLOW_SCALE))& + MASK(CDP_S_LUT_LE_SLOPE_SCALE_0, + LUT_LE_SLOPE_OFLOW_SCALE)); + nvdla_reg_write(engine->driver_context, + reg_base + le_slope_scale_offset[is_sdp], reg); + + reg = ((((uint32_t)uslope->data_i.shifter) << + SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0, + LUT_LE_SLOPE_UFLOW_SHIFT))& + MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0, + LUT_LE_SLOPE_UFLOW_SHIFT)) | + ((((uint32_t)oslope->data_i.shifter) << + SHIFT(CDP_S_LUT_LE_SLOPE_SHIFT_0, + LUT_LE_SLOPE_OFLOW_SHIFT))& + MASK(CDP_S_LUT_LE_SLOPE_SHIFT_0, + LUT_LE_SLOPE_OFLOW_SHIFT)); + nvdla_reg_write(engine->driver_context, + reg_base + le_slope_shift_offset[is_sdp], reg); + + uslope = &lut->linear_only_underflow_slope; + oslope = &lut->linear_only_overflow_slope; + reg = ((((uint32_t)uslope->data_i.scale) << + SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_UFLOW_SCALE))& + MASK(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_UFLOW_SCALE)) | + ((((uint32_t)oslope->data_i.scale) << + SHIFT(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_OFLOW_SCALE))& + MASK(CDP_S_LUT_LO_SLOPE_SCALE_0, + LUT_LO_SLOPE_OFLOW_SCALE)); + nvdla_reg_write(engine->driver_context, + reg_base + lo_slope_scale_offset[is_sdp], reg); + reg = ((((uint32_t)uslope->data_i.shifter) << + SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0, + LUT_LO_SLOPE_UFLOW_SHIFT))& + MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0, + LUT_LO_SLOPE_UFLOW_SHIFT)) | + ((((uint32_t)oslope->data_i.shifter) << + SHIFT(CDP_S_LUT_LO_SLOPE_SHIFT_0, + LUT_LO_SLOPE_OFLOW_SHIFT))& + MASK(CDP_S_LUT_LO_SLOPE_SHIFT_0, + LUT_LO_SLOPE_OFLOW_SHIFT)); + nvdla_reg_write(engine->driver_context, + reg_base + lo_slope_shift_offset[is_sdp], reg); + } +} + +int +validate_data_cube(struct nvdla_data_cube src_data_cube, + struct nvdla_data_cube dst_data_cube, + uint8_t mem_type) +{ + int32_t ret = 0; + + if ((src_data_cube.width > DCUBE_MAX_WIDTH) || + (src_data_cube.height > DCUBE_MAX_HEIGHT) || + (src_data_cube.channel > DCUBE_MAX_CHANNEL)) { + pr_err("Invalid SrcInput Cude[W: %u, H: %u, C: %u]", + src_data_cube.width, src_data_cube.height, src_data_cube.channel); + ret = -EINVAL; + goto exit; + } + + if ((dst_data_cube.width > DCUBE_MAX_WIDTH) || + (dst_data_cube.height > DCUBE_MAX_HEIGHT) || + (dst_data_cube.channel > DCUBE_MAX_CHANNEL)) { + 
pr_err("Invalid DstInput Cude[W: %u, H: %u, C: %u]", + dst_data_cube.width, dst_data_cube.height, dst_data_cube.channel); + ret = -EINVAL; + goto exit; + } + + if (src_data_cube.type > mem_type) { + pr_err("Invalid src_data.mem_type: %u\n", src_data_cube.type); + ret = -EINVAL; + goto exit; + } + + if (dst_data_cube.type > mem_type) { + pr_err("Invalid dst_data.mem_type: %u\n", dst_data_cube.type); + ret = -EINVAL; + goto exit; + } + +exit: + return ret; +} + +int +validate_precision(uint8_t precision, uint8_t map_precision) +{ + int32_t ret = 0; + + if (precision >= map_precision) { + pr_err("Invalid precision: %u\n", precision); + ret = -EINVAL; + } + + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_common.h b/drivers/gpu/drm/nvdla/nvdla_common.h new file mode 100644 index 000000000000..e036c69e2981 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_common.h @@ -0,0 +1,835 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION. + * Copyright (C) 2022 Cai Huoqing + */ + +#ifndef __NVDLA_COMMON_H_ +#define __NVDLA_COMMON_H_ + +#include <linux/types.h> + +#define DCUBE_MAX_WIDTH 8192 +#define DCUBE_MAX_HEIGHT 8192 +#define DCUBE_MAX_CHANNEL 8192 + +/** + * @ingroup Processors + * @name DLA Processors + * Processor modules in DLA engine. Each processor has it's + * own operation a.k.a. HW layer. Network is formed using + * graph of these operations + * @{ + */ +#define NVDLA_OP_BDMA 0 +#define NVDLA_OP_CONV 1 +#define NVDLA_OP_SDP 2 +#define NVDLA_OP_PDP 3 +#define NVDLA_OP_CDP 4 +#define NVDLA_OP_RUBIK 5 +/** @} */ + +/** + * @ingroup Processors + * @name Maximum number of processors + * @brief DLA ash 6 processors + * @{ + */ +#define NVDLA_OP_NUM 6 +/** @} */ + +/** + * @ingroup Processors + * @name Number of groups + * @brief Each processor has 2 groups of registers + * @{ + */ +#define NVDLA_NUM_GROUPS 2 +/** @} */ + +/** + * Network descriptor + * + * Contains all information to execute a network + * + * @op_head: Index of first operation of each type in operations list + * @num_rois: Number of ROIs + * @num_operations: Number of operations in one list + * @num_luts: Number of LUTs + */ +struct nvdla_network_desc { + int16_t operation_desc_index; + int16_t surface_desc_index; + + int16_t dependency_graph_index; + int16_t lut_data_index; + + int16_t roi_array_index; + int16_t surface_index; + + int16_t stat_list_index; + int16_t reserved1; + + int16_t op_head[NVDLA_OP_NUM]; + + uint16_t num_rois; + uint16_t num_operations; + + uint16_t num_luts; + uint16_t num_addresses; + + int16_t input_layer; + uint8_t dynamic_roi; + uint8_t reserved0; +} __packed __aligned(4); + +/** + * @name Memory types + * @brief DLA engnine can read/write to/from 3 memory types + * @{ + */ +#define NVDLA_MEM_MC 0 /* External DRAM */ +#define NVDLA_MEM_CV 1 /* CV-SRAM */ +#define NVDLA_MEM_HW 2 /* DLA sub-module */ +/** @} */ + +/** + * @ingroup Events + * @name Operation events + * @brief Different events triggered by an operations + * @{ + */ +#define NVDLA_EVENT_OP_COMPLETED 1 +#define NVDLA_EVENT_OP_PROGRAMMED 2 +#define NVDLA_EVENT_OP_ENABLED 3 +#define NVDLA_EVENT_CDMA_WT_DONE 4 +#define NVDLA_EVENT_CDMA_DT_DONE 5 +/** @} */ + +struct nvdla_consumer { + int16_t index; /* the index of nvdla_common_op_desc in dep_graph_addr */ + uint8_t event; + uint8_t res; +} __packed __aligned(4); + +struct nvdla_common_op_desc { + int16_t index; /* set by ucode */ + int8_t roi_index; + uint8_t op_type; + + uint8_t dependency_count; + uint8_t reserved0[3]; + + 
struct nvdla_consumer consumers[NVDLA_OP_NUM]; + struct nvdla_consumer fused_parent; +} __packed __aligned(4); + +struct nvdla_roi_array_desc { + uint32_t array_length; + + uint32_t array_reserved; +} __packed __aligned(4); + +struct nvdla_roi_desc { + uint32_t left; + + uint32_t top; + + uint32_t right; + + uint32_t bottom; +} __packed __aligned(4); + +/** + * @ingroup BDMA + * @name Maximum BDMA transfers + * @brief BDMA supports multiple transfers in operation. This indicates + * maximum number of transfers possible in one operation. + * @{ + */ +#define NUM_MAX_BDMA_OPS 20 +/** @} */ + +struct nvdla_bdma_transfer_desc { + int16_t source_address; + int16_t destination_address; + + uint32_t line_size; + + uint32_t line_repeat; + + uint32_t source_line; + + uint32_t destination_line; + + uint32_t surface_repeat; + + uint32_t source_surface; + + uint32_t destination_surface; +} __packed __aligned(4); + +struct nvdla_bdma_surface_desc { + uint8_t source_type; + uint8_t destination_type; + uint16_t num_transfers; + + struct nvdla_bdma_transfer_desc transfers[NUM_MAX_BDMA_OPS]; +} __packed __aligned(4); + +struct nvdla_bdma_op_desc { + uint16_t num_transfers; + uint16_t reserved0; +} __packed __aligned(4); + +struct nvdla_bdma_stat_desc { + uint32_t read_stall; + uint32_t write_stall; +} __packed __aligned(4); + +/** + * @ingroup Convolution + * @name Convolution mode + * @brief Convolution modes support by DLA + * @{ + */ +#define CONV_MODE_DIRECT 0 +#define CONV_MODE_WINOGRAD 1 +/** @} */ + +/** + * @ingroup Processors + * @name Precision BPE mapping + * @brief Precision formats and Bit Per Elements mapping + * @{ + */ +#define BPE_PRECISION_INT8 1 +#define BPE_PRECISION_INT16 2 +#define BPE_PRECISION_FP16 2 +/** @} */ + + +/** + * @ingroup Processors + * @name Precision types + * @brief Precision formats supported by DLA engine + * @{ + */ +#define PRECISION_INT8 0 +#define PRECISION_INT16 1 +#define PRECISION_FP16 2 +/** @} */ + +/** + * @ingroup Processors + * @name Data formats + * @brief Data formats supported by DLA engine + * @{ + */ +#define FORMAT_T_R8 0 +#define FORMAT_T_R10 1 +#define FORMAT_T_R12 2 +#define FORMAT_T_R16 3 +#define FORMAT_T_R16_I 4 +#define FORMAT_T_R16_F 5 +#define FORMAT_T_A16B16G16R16 6 +#define FORMAT_T_X16B16G16R16 7 +#define FORMAT_T_A16B16G16R16_F 8 +#define FORMAT_T_A16Y16U16V16 9 +#define FORMAT_T_V16U16Y16A16 10 +#define FORMAT_T_A16Y16U16V16_F 11 +#define FORMAT_T_A8B8G8R8 12 +#define FORMAT_T_A8R8G8B8 13 +#define FORMAT_T_B8G8R8A8 14 +#define FORMAT_T_R8G8B8A8 15 +#define FORMAT_T_X8B8G8R8 16 +#define FORMAT_T_X8R8G8B8 17 +#define FORMAT_T_B8G8R8X8 18 +#define FORMAT_T_R8G8B8X8 19 +#define FORMAT_T_A2B10G10R10 20 +#define FORMAT_T_A2R10G10B10 21 +#define FORMAT_T_B10G10R10A2 22 +#define FORMAT_T_R10G10B10A2 23 +#define FORMAT_T_A2Y10U10V10 24 +#define FORMAT_T_V10U10Y10A2 25 +#define FORMAT_T_A8Y8U8V8 26 +#define FORMAT_T_V8U8Y8A8 27 +#define FORMAT_T_Y8___U8V8_N444 28 +#define FORMAT_T_Y8___V8U8_N444 29 +#define FORMAT_T_Y10___U10V10_N444 30 +#define FORMAT_T_Y10___V10U10_N444 31 +#define FORMAT_T_Y12___U12V12_N444 32 +#define FORMAT_T_Y12___V12U12_N444 33 +#define FORMAT_T_Y16___U16V16_N444 34 +#define FORMAT_T_Y16___V16U16_N444 35 +#define FORMAT_FEATURE 36 +/** @} */ + +/** + * @ingroup Convolution + * @name Pixel mapping + * @brief Pixel mapping formats supported for image input in Convolution + * @{ + */ +#define MAP_PITCH_LINEAR 0 +/** @} */ + +/** + * @ingroup Convolution + * @name Weight formats + * @brief Weight data formats supported in 
Convolution + * @{ + */ +#define WEIGHT_FORMAT_UNCOMPRESSED 0 +#define WEIGHT_FORMAT_COMPRESSED 1 +/** @} */ + +/** + * @ingroup Convolution + * @name Mean data format + * @brief Mean data formats supported in Convolution + * @{ + */ +#define MEAN_FORMAT_DISABLE 0 +#define MEAN_FORMAT_ENABLE 1 +/** @} */ + +struct nvdla_cvt_param { + int16_t scale; + uint8_t truncate; + uint8_t enable; + + int32_t offset; +} __packed __aligned(4); + +struct nvdla_data_cube { + uint16_t type; /* nvdla_mem_type */ + int16_t address; /* offset to the actual IOVA in task.address_list */ + + uint32_t offset; /* offset within address */ + uint32_t size; + + /* cube dimensions */ + uint16_t width; + uint16_t height; + + uint16_t channel; + uint16_t reserved0; + + /* stride information */ + uint32_t line_stride; + uint32_t surf_stride; + + /* For Rubik only */ + uint32_t plane_stride; +} __packed __aligned(4); + +#define PIXEL_OVERRIDE_UINT 0 +#define PIXEL_OVERRIDE_INT 1 + +struct nvdla_conv_surface_desc { + /* Data cube */ + struct nvdla_data_cube weight_data; + struct nvdla_data_cube wmb_data; + struct nvdla_data_cube wgs_data; + struct nvdla_data_cube src_data; + struct nvdla_data_cube dst_data; + + /** + * u_addr = input_data.source_addr + offset_u + * this field should be set when YUV is not interleave format + * + */ + int64_t offset_u; + + /* line stride for 2nd plane, must be 32bytes aligned */ + uint32_t in_line_uv_stride; +} __packed __aligned(4); + +struct nvdla_conv_op_desc { + /* Performance parameters */ + + /* nvdla_conv_mode */ + uint8_t conv_mode; + uint8_t data_reuse; + uint8_t weight_reuse; + uint8_t skip_data_rls; + + uint8_t skip_weight_rls; + uint8_t reserved0; + uint16_t entry_per_slice; + + /* nvdla_data_format */ + uint8_t data_format; + /* nvdla_pixel_mapping */ + uint8_t pixel_mapping; + /* number of free slices before fetch */ + uint16_t fetch_grain; + + uint8_t reserved_b[8]; + + /* batch_num */ + uint8_t batch; + /* nvdla_weight_format */ + uint8_t weight_format; + uint8_t data_bank; + uint8_t weight_bank; + + /* the offset in bytes of each data cube in a batch */ + uint32_t batch_stride; + + uint8_t post_extension; + uint8_t pixel_override; + /* number of slices need to be released */ + uint16_t release; + + /* The input cube dimension for CSC */ + uint16_t input_width_csc; + uint16_t input_height_csc; + + uint16_t input_channel_csc; + uint16_t kernel_width_csc; + + uint16_t kernel_height_csc; + uint16_t kernel_channel_csc; + + /* The input cube dimension for CMAC */ + uint16_t input_width_cmac; + uint16_t input_height_cmac; + + /* actual size in bytes */ + uint32_t bytes_per_kernel; + + /* Algorithm parameters */ + + int16_t mean_ry; /* mean value for red in RGB or Y in YUV */ + int16_t mean_gu; /* mean value for green in RGB or U in YUV */ + + int16_t mean_bv; /* mean value for blue in RGB or V in YUV */ + int16_t mean_ax; + + uint8_t mean_format; /* nvdla_mean_format */ + uint8_t conv_stride_x; + uint8_t conv_stride_y; + uint8_t pad_x_left; + + uint8_t pad_x_right; + uint8_t pad_y_top; + uint8_t pad_y_bottom; + uint8_t dilation_x; + + uint8_t dilation_y; + uint8_t reserved2[2]; + + /* Precision parameters */ + uint8_t pra_truncate; + + uint8_t in_precision; + /* The output precision from CONV, it's the MAC processing precision */ + uint8_t out_precision; + int16_t pad_val; + + /* input converter parameters */ + struct nvdla_cvt_param in_cvt; + /* output converter parameters, support truncate only */ + struct nvdla_cvt_param out_cvt; + +} __packed __aligned(4); + +struct 
nvdla_conv_stat_desc { + uint32_t data_read_stall; + uint32_t weight_read_stall; + uint32_t data_read_latency; + uint32_t weight_read_latency; + uint32_t saturation_count; + uint32_t nan_data_num; + uint32_t nan_weight_num; + uint32_t inf_data_num; + uint32_t inf_weight_num; +} __packed __aligned(4); + +/** + * @ingroup SDP + * @name Activation functions + * @brief Activation functions supported in SDP + * @{ + */ +#define ACTIVATION_NONE 0 +#define ACTIVATION_RELU 1 +#define ACTIVATION_LUT 2 +#define ACTIVATION_PRELU 3 +/** @} */ + +/** + * @ingroup LUT + * @name LUT size + * @brief LUT sizes for linear and exponentila LUT + * @{ + */ +#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2 6 +#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2 8 +/** @} */ + +/** + * @ingroup LUT + * @name LUT types + * @brief DLA supports two types of LUT, linear and exonential + * @{ + */ +#define LUT_LINEAR_EXP_TABLE 0 +#define LUT_LINEAR_ONLY_TABLE 1 +/** @} */ + +/** + * @ingroup LUT + * @name LUT methods + * @brief DLA supports two types of LUT, linear and exonential + * @{ + */ +#define LUT_METHOD_EXPONENTIAL 0 +#define LUT_METHOD_LINEAR 1 +/** @} */ + +/** + * @ingroup LUT + * @name LUT + * @brief DLA supports two types of LUT, linear and exonential + * @{ + */ +#define LUT_PRI_LINEAR_EXP 0 +#define LUT_PRI_LINEAR_ONLY 1 +/** @} */ + +union nvdla_lut_offset { + /** + * Number should be subtracted on log domain before look up + * exponential table it has the same definition as hardware + * thus input scaling should also take into account when + * set this field. + */ + int8_t exp_offset; + /** + * Number of bits should be right shift before looking + * up linear table + */ + int8_t frac_bits; + uint16_t reserved0; +}; + +/** + * This struct is used to represent floating point values by INT + * suppose we have a float point number fp_x, it will be represented + * as: + * + * fp_x = scale_int_x>>(shifter_x) + * + * This is very useful for INT pipeline; + */ +struct nvdla_float_data { + int16_t scale; + int8_t shifter; + uint8_t reserved0; +} __packed __aligned(4); + +/** + * For INT pipeline, we use the struct above to represent a floating number; + * For FP16 pipeline, we should store the FP16 encoded value into a uint16_t + * container + */ +union nvdla_slope { + struct nvdla_float_data data_i; + + uint16_t data_f; +}; + +struct nvdla_lut_param { + /** + * value of expression ((1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1) is 65, + * ((1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1) is 257, and int16_t is of + * 2Byte. And below two statement's combined memory size is 644 Byte. + * + * NOTE: below two declaration combined size should always be multiple + * of 4. 
+ */ + int16_t linear_exp_table[(1<<LUT_LINEAR_EXP_TABLE_ENTRY_LOG2)+1]; + int16_t linear_only_table[(1<<LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2)+1]; + + union nvdla_lut_offset linear_exp_offset; + union nvdla_lut_offset linear_only_offset; + + /** + * The start and end point of raw table, + * valid when raw_method=LINEAR only + */ + uint64_t linear_exp_start; + uint64_t linear_exp_end; + uint64_t linear_only_start; + uint64_t linear_only_end; + + union nvdla_slope linear_exp_underflow_slope; + union nvdla_slope linear_exp_overflow_slope; + union nvdla_slope linear_only_underflow_slope; + union nvdla_slope linear_only_overflow_slope; + + /** + * nvdla_lut_priority, when both lut are hit(or one overflow, + * the other underflow), which one should be selected as output + */ + uint8_t hybrid_priority; + uint8_t underflow_priority; + uint8_t overflow_priority; + uint8_t method; /* nvdla_lut_method */ +} __packed __aligned(4); + +struct nvdla_sdp_surface_desc { + /* Data cube */ + /* source input cube, available when SDP working on offline mode */ + struct nvdla_data_cube src_data; + + /* X1 input cube */ + struct nvdla_data_cube x1_data; + + /* X2 input cube */ + struct nvdla_data_cube x2_data; + + /* Y input cube */ + struct nvdla_data_cube y_data; + + /* Output cube */ + struct nvdla_data_cube dst_data; +} __packed __aligned(4); + +#define SDP_OP_NONE 0 +#define SDP_OP_MUL 1 +#define SDP_OP_ADD 2 +#define SDP_OP_BOTH 3 + +#define SDP_ALU_OP_MAX 0 +#define SDP_ALU_OP_MIN 1 +#define SDP_ALU_OP_SUM 2 +#define SDP_ALU_OP_EQL 3 + +#define SDP_OP_PER_LAYER 0 +#define SDP_OP_PER_KERNEL 1 +#define SDP_OP_PER_POINT 2 + +struct nvdla_sdp_cvt { + struct nvdla_cvt_param alu_cvt; + struct nvdla_cvt_param mul_cvt; +} __packed __aligned(4); + +struct nvdla_sdp_op { + uint8_t enable; + uint8_t alu_type; /* nvdla_sdp_alu_op_type */ + uint8_t type; /* nvdla_sdp_op_type */ + uint8_t mode; /* nvdla_sdp_op_mode */ + + uint8_t act; /* nvdla_act_type */ + uint8_t shift_value; /* left shift */ + uint8_t truncate; + uint8_t precision; + + int32_t alu_operand; + int32_t mul_operand; + + struct nvdla_sdp_cvt cvt; +} __packed __aligned(4); + +struct nvdla_sdp_op_desc { + /* Precision parameters */ + /* nvdla_precision */ + uint8_t src_precision; + uint8_t dst_precision; + int16_t lut_index; + + struct nvdla_cvt_param out_cvt; + + /* Performance parameters */ + /* nvdla_conv_mode */ + uint8_t conv_mode; + uint8_t batch_num; + uint16_t reserved0; + + uint32_t batch_stride; /* will be used when batch_num > 1 */ + + /* Algorithm parameters */ + struct nvdla_sdp_op x1_op; + struct nvdla_sdp_op x2_op; + struct nvdla_sdp_op y_op; +} __packed __aligned(4); + +#define POOL_MODE_AVG 0 +#define POOL_MODE_MAX 1 +#define POOL_MODE_MIN 2 + +#define POOL_SIZE_1 0 +#define POOL_SIZE_2 1 +#define POOL_SIZE_3 2 +#define POOL_SIZE_4 3 +#define POOL_SIZE_5 4 +#define POOL_SIZE_6 5 +#define POOL_SIZE_7 6 +#define POOL_SIZE_8 7 + +#define PDP_PAD_VAL_NUM 7 + +struct nvdla_pdp_surface_desc { + /* Data cube */ + struct nvdla_data_cube src_data; + + struct nvdla_data_cube dst_data; +} __packed __aligned(4); + +struct nvdla_pdp_op_desc { + /* Performance parameters */ + uint16_t partial_in_width_first; + uint16_t partial_in_width_mid; + + uint16_t partial_in_width_last; + uint16_t partial_width_first; + + uint16_t partial_width_mid; + uint16_t partial_width_last; + + uint8_t split_num; + + /* Algorithm parameters */ + uint8_t pool_mode; /* nvdla_pool_mode */ + uint8_t pool_width; /* nvdla_pool_width */ + uint8_t pool_height; /* nvdla_pool_height */ + 
+ uint8_t stride_x; + uint8_t stride_y; + + /** + * The left/right padding size, + * pad_right might be less than pad_left + */ + uint8_t pad_left; + uint8_t pad_right; + + /* The top/bottom padding size */ + uint8_t pad_top; + uint8_t pad_bottom; + + /* Precision parameters */ + uint8_t precision; /* nvdla_precision */ + uint8_t reserved0; + /** + * if input has non-zero "offset", this value should be set + * There'll be 7 different paddding values, the relationship between + * those versions are: + * padding_value[0] = -offset*scaling; + * padding_value[1] = 2*padding_value[0] + * padding_value[2] = 3*padding_value[0] + * ... + * The purpose is to avoid ucode implement FP16 + * multiplier(for FP16 mode) + */ + int32_t padding_value[PDP_PAD_VAL_NUM]; +} __packed __aligned(4); + +struct nvdla_pdp_stat_desc { + uint32_t inf_input_num; + uint32_t nan_input_num; + uint32_t nan_output_num; + uint32_t write_stall; + uint32_t runtime; +} __packed __aligned(4); + +struct nvdla_cdp_surface_desc { + /* Data cube */ + struct nvdla_data_cube src_data; + + struct nvdla_data_cube dst_data; +} __packed __aligned(4); + +struct nvdla_cdp_op_desc { + /* Precision parameters */ + + /* nvdla_precision */ + uint8_t in_precision; + uint8_t out_precision; + int16_t lut_index; + + struct nvdla_cvt_param in_cvt; + struct nvdla_cvt_param out_cvt; + + /* Performance parameters */ + + /* Algorithm parameters */ + uint8_t local_size; + uint8_t bypass_sqsum; + uint8_t bypass_out_mul; + uint8_t reserved0; +} __packed __aligned(4); + +struct nvdla_cdp_stat_desc { + uint32_t nan_input_num; + uint32_t inf_input_num; + uint32_t nan_output_num; + uint32_t write_stall; + uint32_t lut_uflow; + uint32_t lut_oflow; + uint32_t lut_hybrid; + uint32_t lut_le_hit; + uint32_t lut_lo_hit; + uint32_t saturation_count; + uint32_t runtime; +} __packed __aligned(4); + +struct nvdla_rubik_surface_desc { + /* Data cube */ + struct nvdla_data_cube src_data; + + struct nvdla_data_cube dst_data; +} __packed __aligned(4); + +/* rubik mode */ +#define RUBIK_MODE_CONTRACT 0 +#define RUBIK_MODE_SPLIT 1 +#define RUBIK_MODE_MERGE 2 + +struct nvdla_rubik_op_desc { + /* Precision parameters */ + uint8_t mode; + uint8_t precision; + uint8_t stride_x; + uint8_t stride_y; +} __packed __aligned(4); + +struct nvdla_rubik_stat_desc { + uint32_t read_stall; + uint32_t write_stall; + uint32_t runtime; +} __packed __aligned(4); + +union nvdla_surface_container { + struct nvdla_bdma_surface_desc bdma_surface; + struct nvdla_conv_surface_desc conv_surface; + struct nvdla_sdp_surface_desc sdp_surface; + struct nvdla_pdp_surface_desc pdp_surface; + struct nvdla_cdp_surface_desc cdp_surface; + struct nvdla_rubik_surface_desc rubik_surface; +}; + +union nvdla_operation_container { + struct nvdla_bdma_op_desc bdma_op; + struct nvdla_conv_op_desc conv_op; + struct nvdla_sdp_op_desc sdp_op; + struct nvdla_pdp_op_desc pdp_op; + struct nvdla_cdp_op_desc cdp_op; + struct nvdla_rubik_op_desc rubik_op; +}; + +struct nvdla_engine; +void update_lut(struct nvdla_engine *engine, uint32_t reg_base, + struct nvdla_lut_param *lut, + uint8_t precision); +int32_t validate_data_cube(struct nvdla_data_cube src_data_cube, + struct nvdla_data_cube dst_data_cube, + uint8_t mem_type); +int32_t validate_precision(uint8_t precision, + uint8_t map_precision); + +#endif diff --git a/drivers/gpu/drm/nvdla/nvdla_conv.c b/drivers/gpu/drm/nvdla/nvdla_conv.c new file mode 100644 index 000000000000..3740b2ab2915 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_conv.c @@ -0,0 +1,684 @@ +// 
SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_common.h" +#include "nvdla_engine.h" + +static const uint8_t map_precision[] = { + FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, INT16), + FIELD_ENUM(CDMA_D_MISC_CFG_0, IN_PRECISION, FP16), +}; + +static const uint8_t map_conv[] = { + FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, DIRECT), + FIELD_ENUM(CACC_D_MISC_CFG_0, CONV_MODE, WINOGRAD), +}; + +static const uint8_t map_weight_fmt[] = { + FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, UNCOMPRESSED), + FIELD_ENUM(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT, COMPRESSED), +}; + +static const uint8_t map_img_fmt[][2] = { + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R8), 1}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R10), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R12), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R16), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R16_I), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R16_F), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A16B16G16R16), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_X16B16G16R16), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A16B16G16R16_F), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A16Y16U16V16), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_V16U16Y16A16), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A16Y16U16V16_F), 8}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A8B8G8R8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A8R8G8B8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_B8G8R8A8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R8G8B8A8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_X8B8G8R8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_X8R8G8B8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_B8G8R8X8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R8G8B8X8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A2B10G10R10), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A2R10G10B10), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_B10G10R10A2), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_R10G10B10A2), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A2Y10U10V10), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_V10U10Y10A2), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_A8Y8U8V8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_V8U8Y8A8), 4}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y8___U8V8_N444), 1}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y8___V8U8_N444), 1}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y10___U10V10_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y10___V10U10_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y12___U12V12_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y12___V12U12_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y16___U16V16_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + PIXEL_FORMAT, T_Y16___V16U16_N444), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + DATAIN_FORMAT, 
FEATURE), 2}, + {FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + DATAIN_FORMAT, PIXEL), 1}, +}; + +static const uint8_t map_pixel[] = { + FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING, PITCH_LINEAR), +}; + +static const uint8_t map_ram[] = { + FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF), + FIELD_ENUM(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF), +}; + +static const uint8_t map_mean[] = { + FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, DISABLE), + FIELD_ENUM(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT, ENABLE), +}; + +static uint32_t +get_in_format(uint8_t format) +{ + uint32_t in_format = 0; + + if (format >= FORMAT_T_R8 && format < FORMAT_FEATURE) { + in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + DATAIN_FORMAT, PIXEL); + } else if (format == FORMAT_FEATURE) { + in_format = FIELD_ENUM(CDMA_D_DATAIN_FORMAT_0, + DATAIN_FORMAT, FEATURE); + } + + return in_format; +} + +void +nvdla_conv_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id) +{ + uint32_t reg; + + /* set producer pointer for all sub-modules */ + reg = group_id << SHIFT(CACC_S_POINTER_0, PRODUCER); + cacc_reg_write(engine, S_POINTER, reg); + cmac_a_reg_write(engine, S_POINTER, reg); + cmac_b_reg_write(engine, S_POINTER, reg); + csc_reg_write(engine, S_POINTER, reg); + cdma_reg_write(engine, S_POINTER, reg); +} + +int +nvdla_conv_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + uint32_t reg; + + do { + reg = cdma_reg_read(engine, S_CBUF_FLUSH_STATUS); + } while (!(reg & MASK(CDMA_S_CBUF_FLUSH_STATUS_0, FLUSH_DONE))); + + /* enable all sub-modules */ + reg = FIELD_ENUM(CACC_D_OP_ENABLE_0, OP_EN, ENABLE); + cacc_reg_write(engine, D_OP_ENABLE, reg); + cmac_a_reg_write(engine, D_OP_ENABLE, reg); + cmac_b_reg_write(engine, D_OP_ENABLE, reg); + csc_reg_write(engine, D_OP_ENABLE, reg); + cdma_reg_write(engine, D_OP_ENABLE, reg); + + return 0; +} + +void +nvdla_conv_rdma_check(struct nvdla_processor_group *group) +{ + group->is_rdma_needed = 0; +} + +static int32_t +processor_conv_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret = 0; + uint32_t reg, high, low, shift, mask; + uint32_t stride_x, stride_y, pad_x, pad_y; + uint64_t weight_address = 0; + uint64_t wmb_address = 0; + uint64_t wgs_address = 0; + uint64_t input_address = 0; + uint64_t output_address = 0; + uint32_t atom_size = 0; + bool weight_compress_support = false; + struct nvdla_conv_op_desc *conv_op; + struct nvdla_conv_surface_desc *conv_surface; + + weight_compress_support = engine->config_data->weight_compress_support; + atom_size = engine->config_data->atom_size; + conv_op = &group->operation_desc->conv_op; + conv_surface = &group->surface_desc->conv_surface; + + if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) { + ASSERT_GOTO((weight_compress_support), ret, -EINVAL, exit); + ASSERT_GOTO((conv_surface->wmb_data.address != -1), + ret, -EINVAL, exit); + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + conv_surface->wmb_data.address, + conv_surface->wmb_data.offset, + (void *)&wmb_address, + DESTINATION_DMA); + + ASSERT_GOTO((conv_surface->wgs_data.address != -1), + ret, -EINVAL, exit); + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + conv_surface->wgs_data.address, + conv_surface->wgs_data.offset, + (void *)&wgs_address, + DESTINATION_DMA); + } + + if (conv_surface->weight_data.address != -1) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + 
conv_surface->weight_data.address, + conv_surface->weight_data.offset, + (void *)&weight_address, + DESTINATION_DMA); + } + + if (conv_surface->dst_data.address != -1) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + conv_surface->dst_data.address, + conv_surface->dst_data.offset, + (void *)&output_address, + DESTINATION_DMA); + } + + ret = nvdla_read_input_address(engine, &conv_surface->src_data, + &input_address, + group->op_desc->index, + group->roi_index, + map_img_fmt[conv_op->data_format][1]); + if (ret) + goto exit; + + ASSERT_GOTO((conv_op->out_cvt.scale == 1), ret, -EINVAL, exit); + ASSERT_GOTO((conv_op->out_cvt.offset == 0), ret, -EINVAL, exit); + + /* check if the register group is idle */ + reg = cacc_reg_read(engine, S_STATUS); + mask = group->id ? MASK(CACC_S_STATUS_0, STATUS_1) : + MASK(CACC_S_STATUS_0, STATUS_0); + shift = group->id ? SHIFT(CACC_S_STATUS_0, STATUS_1) : + SHIFT(CACC_S_STATUS_0, STATUS_0); + reg = (reg & mask) >> shift; + ASSERT_GOTO((reg == FIELD_ENUM(CACC_S_STATUS_0, STATUS_0, IDLE)), + ret, -EINVAL, exit); + + reg = cmac_a_reg_read(engine, S_STATUS); + mask = group->id ? MASK(CMAC_A_S_STATUS_0, STATUS_1) : + MASK(CMAC_A_S_STATUS_0, STATUS_0); + shift = group->id ? SHIFT(CMAC_A_S_STATUS_0, STATUS_1) : + SHIFT(CMAC_A_S_STATUS_0, STATUS_0); + reg = (reg & mask) >> shift; + ASSERT_GOTO((reg == FIELD_ENUM(CMAC_A_S_STATUS_0, STATUS_0, IDLE)), + ret, -EINVAL, exit); + + reg = cmac_b_reg_read(engine, S_STATUS); + mask = group->id ? MASK(CMAC_B_S_STATUS_0, STATUS_1) : + MASK(CMAC_B_S_STATUS_0, STATUS_0); + shift = group->id ? SHIFT(CMAC_B_S_STATUS_0, STATUS_1) : + SHIFT(CMAC_B_S_STATUS_0, STATUS_0); + reg = (reg & mask) >> shift; + ASSERT_GOTO((reg == FIELD_ENUM(CMAC_B_S_STATUS_0, STATUS_0, IDLE)), + ret, -EINVAL, exit); + + reg = csc_reg_read(engine, S_STATUS); + mask = group->id ? MASK(CSC_S_STATUS_0, STATUS_1) : + MASK(CSC_S_STATUS_0, STATUS_0); + shift = group->id ? SHIFT(CSC_S_STATUS_0, STATUS_1) : + SHIFT(CSC_S_STATUS_0, STATUS_0); + reg = (reg & mask) >> shift; + ASSERT_GOTO((reg == FIELD_ENUM(CSC_S_STATUS_0, STATUS_0, IDLE)), + ret, -EINVAL, exit); + + reg = cdma_reg_read(engine, S_STATUS); + mask = group->id ? MASK(CDMA_S_STATUS_0, STATUS_1) : + MASK(CDMA_S_STATUS_0, STATUS_0); + shift = group->id ? 
SHIFT(CDMA_S_STATUS_0, STATUS_1) : + SHIFT(CDMA_S_STATUS_0, STATUS_0); + reg = (reg & mask) >> shift; + ASSERT_GOTO((reg == FIELD_ENUM(CDMA_S_STATUS_0, STATUS_0, IDLE)), + ret, -EINVAL, exit); + + /* reverse config each sub-module in CC */ + + /* CACC */ + reg = (map_conv[conv_op->conv_mode] + << SHIFT(CACC_D_MISC_CFG_0, CONV_MODE)) | + (map_precision[conv_op->out_precision] + << SHIFT(CACC_D_MISC_CFG_0, PROC_PRECISION)); + cacc_reg_write(engine, D_MISC_CFG, reg); + + reg = ((conv_surface->dst_data.width - 1) + << SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) | + ((conv_surface->dst_data.height - 1) + << SHIFT(CACC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT)); + cacc_reg_write(engine, D_DATAOUT_SIZE_0, reg); + + reg = ((conv_surface->dst_data.channel - 1) + << SHIFT(CACC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL)); + cacc_reg_write(engine, D_DATAOUT_SIZE_1, reg); + + low = lower_32_bits(output_address); + cacc_reg_write(engine, D_DATAOUT_ADDR, low); + cacc_reg_write(engine, D_BATCH_NUMBER, conv_op->batch - 1); + cacc_reg_write(engine, D_LINE_STRIDE, conv_surface->dst_data.line_stride); + cacc_reg_write(engine, D_SURF_STRIDE, conv_surface->dst_data.surf_stride); + + if (conv_surface->dst_data.width == 1 && + conv_surface->dst_data.height == 1) { + ASSERT_GOTO((((uint32_t)conv_surface->dst_data.line_stride == + (uint32_t)(conv_surface->dst_data.width * atom_size))), + ret, -EINVAL, exit); + reg = (CACC_D_DATAOUT_MAP_0_LINE_PACKED_TRUE << + SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED)); + reg |= (CACC_D_DATAOUT_MAP_0_SURF_PACKED_TRUE << + SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED)); + } else { + reg = (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, LINE_PACKED, FALSE) << + SHIFT(CACC_D_DATAOUT_MAP_0, LINE_PACKED)); + reg |= (FIELD_ENUM(CACC_D_DATAOUT_MAP_0, SURF_PACKED, FALSE) << + SHIFT(CACC_D_DATAOUT_MAP_0, SURF_PACKED)); + } + cacc_reg_write(engine, D_DATAOUT_MAP, reg); + + cacc_reg_write(engine, D_CLIP_CFG, conv_op->out_cvt.truncate); + + /* CMAC */ + reg = (map_conv[conv_op->conv_mode] + << SHIFT(CMAC_A_D_MISC_CFG_0, CONV_MODE)) | + (map_precision[conv_op->out_precision] + << SHIFT(CMAC_A_D_MISC_CFG_0, PROC_PRECISION)); + cmac_a_reg_write(engine, D_MISC_CFG, reg); + cmac_b_reg_write(engine, D_MISC_CFG, reg); + + /* CSC */ + reg = (map_conv[conv_op->conv_mode] + << SHIFT(CSC_D_MISC_CFG_0, CONV_MODE)) | + (map_precision[conv_op->out_precision] + << SHIFT(CSC_D_MISC_CFG_0, IN_PRECISION)) | + (map_precision[conv_op->out_precision] + << SHIFT(CSC_D_MISC_CFG_0, PROC_PRECISION)) | + (conv_op->data_reuse + << SHIFT(CSC_D_MISC_CFG_0, DATA_REUSE)) | + (conv_op->weight_reuse + << SHIFT(CSC_D_MISC_CFG_0, WEIGHT_REUSE)) | + (conv_op->skip_data_rls + << SHIFT(CSC_D_MISC_CFG_0, SKIP_DATA_RLS)) | + (conv_op->skip_weight_rls + << SHIFT(CSC_D_MISC_CFG_0, SKIP_WEIGHT_RLS)); + csc_reg_write(engine, D_MISC_CFG, reg); + + reg = (get_in_format(conv_op->data_format) << + SHIFT(CSC_D_DATAIN_FORMAT_0, DATAIN_FORMAT)); + csc_reg_write(engine, D_DATAIN_FORMAT, reg); + + reg = ((conv_op->input_width_csc - 1) + << SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) | + ((conv_op->input_height_csc - 1) + << SHIFT(CSC_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT)); + csc_reg_write(engine, D_DATAIN_SIZE_EXT_0, reg); + + reg = ((conv_op->input_channel_csc - 1) + << SHIFT(CSC_D_DATAIN_SIZE_EXT_1_0, DATAIN_CHANNEL_EXT)); + csc_reg_write(engine, D_DATAIN_SIZE_EXT_1, reg); + + reg = ((conv_op->batch - 1) + << SHIFT(CSC_D_BATCH_NUMBER_0, BATCHES)); + csc_reg_write(engine, D_BATCH_NUMBER, reg); + reg = ((conv_op->post_extension) + << 
SHIFT(CSC_D_POST_Y_EXTENSION_0, Y_EXTENSION)); + csc_reg_write(engine, D_POST_Y_EXTENSION, reg); + + reg = ((conv_op->entry_per_slice - 1) + << SHIFT(CSC_D_ENTRY_PER_SLICE_0, ENTRIES)); + csc_reg_write(engine, D_ENTRY_PER_SLICE, reg); + + reg = (map_weight_fmt[conv_op->weight_format] + << SHIFT(CSC_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT)); + csc_reg_write(engine, D_WEIGHT_FORMAT, reg); + + reg = ((conv_op->kernel_width_csc - 1) + << SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_WIDTH_EXT)) | + ((conv_op->kernel_height_csc - 1) + << SHIFT(CSC_D_WEIGHT_SIZE_EXT_0_0, WEIGHT_HEIGHT_EXT)); + csc_reg_write(engine, D_WEIGHT_SIZE_EXT_0, reg); + + reg = ((conv_op->kernel_channel_csc - 1) + << SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_CHANNEL_EXT)) | + ((conv_surface->dst_data.channel - 1) + << SHIFT(CSC_D_WEIGHT_SIZE_EXT_1_0, WEIGHT_KERNEL)); + csc_reg_write(engine, D_WEIGHT_SIZE_EXT_1, reg); + + csc_reg_write(engine, D_WEIGHT_BYTES, conv_surface->weight_data.size); + csc_reg_write(engine, D_WMB_BYTES, conv_surface->wmb_data.size); + + reg = ((conv_op->input_width_cmac - 1) + << SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_WIDTH)) | + ((conv_op->input_height_cmac - 1) + << SHIFT(CSC_D_DATAOUT_SIZE_0_0, DATAOUT_HEIGHT)); + csc_reg_write(engine, D_DATAOUT_SIZE_0, reg); + + reg = ((conv_surface->dst_data.channel - 1) + << SHIFT(CSC_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL)); + csc_reg_write(engine, D_DATAOUT_SIZE_1, reg); + + reg = ((conv_surface->dst_data.width * + conv_surface->dst_data.height - 1) + << SHIFT(CSC_D_ATOMICS_0, ATOMICS)); + csc_reg_write(engine, D_ATOMICS, reg); + reg = ((conv_op->release - 1) + << SHIFT(CSC_D_RELEASE_0, RLS_SLICES)); + csc_reg_write(engine, D_RELEASE, reg); + + if (conv_op->conv_mode == CONV_MODE_DIRECT) { + stride_x = conv_op->conv_stride_x - 1; + stride_y = conv_op->conv_stride_y - 1; + pad_x = conv_op->pad_x_left; + pad_y = conv_op->pad_y_top; + } else { + stride_x = 0; + stride_y = 0; + pad_x = 0; + pad_y = 0; + } + + reg = (stride_x + << SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_X_STRIDE_EXT)) | + (stride_y + << SHIFT(CSC_D_CONV_STRIDE_EXT_0, CONV_Y_STRIDE_EXT)); + csc_reg_write(engine, D_CONV_STRIDE_EXT, reg); + + reg = ((conv_op->dilation_x - 1) + << SHIFT(CSC_D_DILATION_EXT_0, X_DILATION_EXT)) | + ((conv_op->dilation_y - 1) + << SHIFT(CSC_D_DILATION_EXT_0, Y_DILATION_EXT)); + csc_reg_write(engine, D_DILATION_EXT, reg); + + reg = (pad_x + << SHIFT(CSC_D_ZERO_PADDING_0, PAD_LEFT)) | + (pad_y + << SHIFT(CSC_D_ZERO_PADDING_0, PAD_TOP)); + csc_reg_write(engine, D_ZERO_PADDING, reg); + + reg = (conv_op->pad_val + << SHIFT(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE)) & + MASK(CSC_D_ZERO_PADDING_VALUE_0, PAD_VALUE); + csc_reg_write(engine, D_ZERO_PADDING_VALUE, reg); + + reg = ((conv_op->data_bank - 1) + << SHIFT(CSC_D_BANK_0, DATA_BANK)) | + ((conv_op->weight_bank - 1) + << SHIFT(CSC_D_BANK_0, WEIGHT_BANK)); + csc_reg_write(engine, D_BANK, reg); + csc_reg_write(engine, D_PRA_CFG, conv_op->pra_truncate); + + /* CBUF */ + /* there's no CBUF register */ + + /* CDMA */ + reg = (map_conv[conv_op->conv_mode] + << SHIFT(CDMA_D_MISC_CFG_0, CONV_MODE)) | + (map_precision[conv_op->in_precision] + << SHIFT(CDMA_D_MISC_CFG_0, IN_PRECISION)) | + (map_precision[conv_op->out_precision] + << SHIFT(CDMA_D_MISC_CFG_0, PROC_PRECISION)) | + (conv_op->data_reuse + << SHIFT(CDMA_D_MISC_CFG_0, DATA_REUSE)) | + (conv_op->weight_reuse + << SHIFT(CDMA_D_MISC_CFG_0, WEIGHT_REUSE)) | + (conv_op->skip_data_rls + << SHIFT(CDMA_D_MISC_CFG_0, SKIP_DATA_RLS)) | + (conv_op->skip_weight_rls + << SHIFT(CDMA_D_MISC_CFG_0, 
SKIP_WEIGHT_RLS)); + cdma_reg_write(engine, D_MISC_CFG, reg); + + reg = (get_in_format(conv_op->data_format) << + SHIFT(CDMA_D_DATAIN_FORMAT_0, DATAIN_FORMAT)) | + (map_img_fmt[conv_op->data_format][0] + << SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_FORMAT)) | + (map_pixel[conv_op->pixel_mapping] + << SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_MAPPING)) | + (conv_op->pixel_override + << SHIFT(CDMA_D_DATAIN_FORMAT_0, PIXEL_SIGN_OVERRIDE)); + cdma_reg_write(engine, D_DATAIN_FORMAT, reg); + + reg = ((conv_surface->src_data.width - 1) + << SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) | + ((conv_surface->src_data.height - 1) + << SHIFT(CDMA_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT)); + cdma_reg_write(engine, D_DATAIN_SIZE_0, reg); + + reg = ((conv_surface->src_data.channel - 1) + << SHIFT(CDMA_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL)); + cdma_reg_write(engine, D_DATAIN_SIZE_1, reg); + + reg = ((conv_op->input_width_csc - 1) + << SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_WIDTH_EXT)) | + ((conv_op->input_height_csc - 1) + << SHIFT(CDMA_D_DATAIN_SIZE_EXT_0_0, DATAIN_HEIGHT_EXT)); + cdma_reg_write(engine, D_DATAIN_SIZE_EXT_0, reg); + + reg = (map_ram[conv_surface->src_data.type] + << SHIFT(CDMA_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE)); + cdma_reg_write(engine, D_DAIN_RAM_TYPE, reg); + + high = upper_32_bits(input_address); + low = lower_32_bits(input_address); + cdma_reg_write(engine, D_DAIN_ADDR_HIGH_0, high); + cdma_reg_write(engine, D_DAIN_ADDR_LOW_0, low); + + high = upper_32_bits((input_address + conv_surface->offset_u)); + low = lower_32_bits(input_address + conv_surface->offset_u); + cdma_reg_write(engine, D_DAIN_ADDR_HIGH_1, high); + cdma_reg_write(engine, D_DAIN_ADDR_LOW_1, low); + + cdma_reg_write(engine, D_LINE_STRIDE, conv_surface->src_data.line_stride); + cdma_reg_write(engine, D_SURF_STRIDE, conv_surface->src_data.surf_stride); + cdma_reg_write(engine, D_LINE_UV_STRIDE, conv_surface->in_line_uv_stride); + + reg = ((conv_surface->src_data.line_stride == + ((uint32_t)conv_surface->src_data.width * atom_size)) + << SHIFT(CDMA_D_DAIN_MAP_0, LINE_PACKED)); + reg |= ((conv_surface->src_data.surf_stride == + ((uint32_t)(conv_surface->src_data.width * + conv_surface->src_data.height) * atom_size)) + << SHIFT(CDMA_D_DAIN_MAP_0, SURF_PACKED)); + cdma_reg_write(engine, D_DAIN_MAP, reg); + + reg = ((conv_op->batch - 1) + << SHIFT(CDMA_D_BATCH_NUMBER_0, BATCHES)); + cdma_reg_write(engine, D_BATCH_NUMBER, reg); + + cdma_reg_write(engine, D_BATCH_STRIDE, conv_op->batch_stride); + + reg = ((conv_op->entry_per_slice - 1) + << SHIFT(CDMA_D_ENTRY_PER_SLICE_0, ENTRIES)); + cdma_reg_write(engine, D_ENTRY_PER_SLICE, reg); + + reg = ((conv_op->fetch_grain - 1) + << SHIFT(CDMA_D_FETCH_GRAIN_0, GRAINS)); + cdma_reg_write(engine, D_FETCH_GRAIN, reg); + + reg = (map_weight_fmt[conv_op->weight_format] + << SHIFT(CDMA_D_WEIGHT_FORMAT_0, WEIGHT_FORMAT)); + cdma_reg_write(engine, D_WEIGHT_FORMAT, reg); + + reg = ((conv_op->bytes_per_kernel - 1) + << SHIFT(CDMA_D_WEIGHT_SIZE_0_0, BYTE_PER_KERNEL)); + cdma_reg_write(engine, D_WEIGHT_SIZE_0, reg); + + reg = ((conv_surface->dst_data.channel - 1) + << SHIFT(CDMA_D_WEIGHT_SIZE_1_0, WEIGHT_KERNEL)); + cdma_reg_write(engine, D_WEIGHT_SIZE_1, reg); + + reg = (map_ram[conv_surface->weight_data.type] + << SHIFT(CDMA_D_WEIGHT_RAM_TYPE_0, WEIGHT_RAM_TYPE)); + cdma_reg_write(engine, D_WEIGHT_RAM_TYPE, reg); + + high = upper_32_bits(weight_address); + low = lower_32_bits(weight_address); + cdma_reg_write(engine, D_WEIGHT_ADDR_HIGH, high); + cdma_reg_write(engine, D_WEIGHT_ADDR_LOW, low); + cdma_reg_write(engine, 
D_WEIGHT_BYTES, conv_surface->weight_data.size); + + if (conv_op->weight_format == WEIGHT_FORMAT_COMPRESSED) { + high = upper_32_bits(wgs_address); + low = lower_32_bits(wgs_address); + cdma_reg_write(engine, D_WGS_ADDR_HIGH, high); + cdma_reg_write(engine, D_WGS_ADDR_LOW, low); + + high = upper_32_bits(wmb_address); + low = lower_32_bits(wmb_address); + cdma_reg_write(engine, D_WMB_ADDR_HIGH, high); + cdma_reg_write(engine, D_WMB_ADDR_LOW, low); + cdma_reg_write(engine, D_WMB_BYTES, conv_surface->wmb_data.size); + } + + reg = (map_mean[conv_op->mean_format] + << SHIFT(CDMA_D_MEAN_FORMAT_0, MEAN_FORMAT)); + cdma_reg_write(engine, D_MEAN_FORMAT, reg); + + if (conv_op->mean_format == MEAN_FORMAT_ENABLE) { + reg = ((conv_op->mean_ry + << SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) & + MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_RY)) | + ((conv_op->mean_gu + << SHIFT(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU)) & + MASK(CDMA_D_MEAN_GLOBAL_0_0, MEAN_GU)); + cdma_reg_write(engine, D_MEAN_GLOBAL_0, reg); + + reg = ((conv_op->mean_bv + << SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV))& + MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_BV)) | + ((conv_op->mean_ax + << SHIFT(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX))& + MASK(CDMA_D_MEAN_GLOBAL_1_0, MEAN_AX)); + cdma_reg_write(engine, D_MEAN_GLOBAL_1, reg); + } + + if (conv_op->in_cvt.enable) { + reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, ENABLE)) + << SHIFT(CDMA_D_CVT_CFG_0, CVT_EN)) | + (conv_op->in_cvt.truncate + << SHIFT(CDMA_D_CVT_CFG_0, CVT_TRUNCATE)); + cdma_reg_write(engine, D_CVT_CFG, reg); + cdma_reg_write(engine, D_CVT_OFFSET, conv_op->in_cvt.offset); + cdma_reg_write(engine, D_CVT_SCALE, conv_op->in_cvt.scale); + } else { + reg = ((FIELD_ENUM(CDMA_D_CVT_CFG_0, CVT_EN, DISABLE)) + << SHIFT(CDMA_D_CVT_CFG_0, CVT_EN)); + cdma_reg_write(engine, D_CVT_CFG, reg); + } + + reg = ((conv_op->conv_stride_x - 1) + << SHIFT(CDMA_D_CONV_STRIDE_0, CONV_X_STRIDE)) | + ((conv_op->conv_stride_y - 1) + << SHIFT(CDMA_D_CONV_STRIDE_0, CONV_Y_STRIDE)); + cdma_reg_write(engine, D_CONV_STRIDE, reg); + + reg = (conv_op->pad_x_left << + SHIFT(CDMA_D_ZERO_PADDING_0, PAD_LEFT)) | + (conv_op->pad_x_right + << SHIFT(CDMA_D_ZERO_PADDING_0, PAD_RIGHT)) | + (conv_op->pad_y_top + << SHIFT(CDMA_D_ZERO_PADDING_0, PAD_TOP)) | + (conv_op->pad_y_bottom + << SHIFT(CDMA_D_ZERO_PADDING_0, PAD_BOTTOM)); + cdma_reg_write(engine, D_ZERO_PADDING, reg); + + reg = conv_op->pad_val << + SHIFT(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE) & + MASK(CDMA_D_ZERO_PADDING_VALUE_0, PAD_VALUE); + cdma_reg_write(engine, D_ZERO_PADDING_VALUE, reg); + reg = ((conv_op->weight_bank - 1) + << SHIFT(CDMA_D_BANK_0, WEIGHT_BANK)) | + ((conv_op->data_bank - 1) + << SHIFT(CDMA_D_BANK_0, DATA_BANK)); + cdma_reg_write(engine, D_BANK, reg); + +exit: + return ret; +} + +int +nvdla_conv_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + return 1; +} + +void +nvdla_conv_dump_config(struct nvdla_processor_group *group) +{ + struct nvdla_conv_op_desc *conv_op; + struct nvdla_conv_surface_desc *conv_surface; + + conv_surface = &group->surface_desc->conv_surface; + conv_op = &group->operation_desc->conv_op; +} + +int +nvdla_conv_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret; + + ret = processor_conv_program(engine, group); + if (ret) + goto exit; + +exit: + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_drv.c b/drivers/gpu/drm/nvdla/nvdla_drv.c new file mode 100644 index 000000000000..495ce5fc2e4e --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_drv.c @@ -0,0 +1,694 @@ +// 
SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include <linux/dma-buf.h> +#include <linux/dma-mapping.h> +#include <linux/fs.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/of_irq.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/time.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <drm/drm_drv.h> +#include <drm/drm_print.h> +#include <drm/nvdla_drm.h> + +#include "nvdla_drv.h" +#include "nvdla_engine.h" + +static struct nvdla_config nvdla_config_os_initial = { + .atom_size = 32, + .bdma_enable = true, + .rubik_enable = true, + .weight_compress_support = true, +}; + +static struct nvdla_config nvdla_config_small = { + //.atom_size = 8, + .atom_size = 32, // nv_large config + .bdma_enable = false, + .rubik_enable = false, + .weight_compress_support = false, +}; + +void nvdla_reg_write(void *driver_context, uint32_t addr, uint32_t reg) +{ + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)driver_context; + + if (!nvdla_dev) + return; + + writel(reg, nvdla_dev->base + addr); +} + +uint32_t nvdla_reg_read(void *driver_context, uint32_t addr) +{ + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)driver_context; + + if (!nvdla_dev) + return 0; + + return readl(nvdla_dev->base + addr); +} + +static irqreturn_t nvdla_engine_isr(int32_t irq, void *data) +{ + unsigned long flags; + uint32_t mask; + uint32_t reg; + struct nvdla_processor *processor = NULL; + struct nvdla_processor_group *group; + struct nvdla_engine *engine; + struct nvdla_device *nvdla_dev = (struct nvdla_device *)data; + + if (!nvdla_dev) + return IRQ_NONE; + + engine = nvdla_dev->engine_context; + spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags); + + mask = glb_reg_read(engine, S_INTR_MASK); + reg = glb_reg_read(engine, S_INTR_STATUS); + + if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CACC_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_SDP]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, SDP_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_SDP]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_CDP]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDP_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_CDP]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, RUBIK_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_RUBIK]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, 
RUBIK_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_RUBIK]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_PDP]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, PDP_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_PDP]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_BDMA]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, BDMA_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_BDMA]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_OP_COMPLETED); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_CDMA_DT_DONE); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_DAT_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_CDMA_DT_DONE); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS0)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[0]; + group->events |= (1 << NVDLA_EVENT_CDMA_WT_DONE); + } + if (reg & MASK(GLB_S_INTR_STATUS_0, CDMA_WT_DONE_STATUS1)) { + processor = &engine->processors[NVDLA_OP_CONV]; + group = &processor->groups[1]; + group->events |= (1 << NVDLA_EVENT_CDMA_WT_DONE); + } + + glb_reg_write(engine, S_INTR_STATUS, reg); + mask = glb_reg_read(engine, S_INTR_MASK); + reg = glb_reg_read(engine, S_INTR_STATUS); + + complete(&nvdla_dev->event_notifier); + spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags); + + return IRQ_HANDLED; +} + +static int32_t nvdla_read_dma_address(void *driver_context, void *task_data, + int16_t index, void *dst) +{ + int32_t ret = 0; + struct nvdla_mem_handle *handles; + dma_addr_t *phys_addr = (dma_addr_t *)(dst); + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)driver_context; + struct nvdla_task *task = (struct nvdla_task *)task_data; + + if (index == -1 || index > task->num_addresses) + return -EINVAL; + + handles = (struct nvdla_mem_handle *)task->address_list; + ret = nvdla_gem_dma_addr(nvdla_dev->drm, task->file, + handles[index].handle, + phys_addr); + + /* Add offset to IOVA address */ + *phys_addr = *phys_addr + handles[index].offset; + + return ret; +} + +static int32_t nvdla_read_cpu_address(void *driver_context, void *task_data, + int16_t index, void *dst) +{ + uint64_t *temp = (uint64_t *)dst; + struct nvdla_task *task = (struct nvdla_task *)task_data; + + if (index == -1 || index > task->num_addresses) + return -EINVAL; + + *temp = (uint64_t)index; + return 0; +} + +int32_t nvdla_get_dma_address(void *driver_context, void *task_data, + int16_t index, void *dst_ptr, + uint32_t destination) +{ + int32_t ret = 0; + + if (destination == DESTINATION_PROCESSOR) { + ret = nvdla_read_cpu_address(driver_context, task_data, + index, dst_ptr); + } else if (destination == DESTINATION_DMA) { + ret = nvdla_read_dma_address(driver_context, task_data, + index, dst_ptr); + } else { + ret = -EINVAL; + } + + return ret; +} + +int32_t nvdla_data_write(void *driver_context, void *task_data, + void 
*src, uint64_t dst, + uint32_t size, uint64_t offset) +{ + int32_t ret; + struct dma_buf *buf; + struct iosys_map map; + struct nvdla_mem_handle *handles; + struct nvdla_task *task = (struct nvdla_task *)task_data; + struct nvdla_device *nvdla_dev = (struct nvdla_device *)driver_context; + + handles = task->address_list; + buf = dma_buf_get(handles[dst].handle); + if (IS_ERR(buf)) { + drm_err(nvdla_dev->drm, "Failed get dma_buf for handle=%d\n", + handles[dst].handle); + return -EFAULT; + } + + ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL); + if (ret) + goto put_dma_buf; + + ret = dma_buf_vmap(buf, &map); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to vmap dma_buf for handle=%d err %d\n", + handles[dst].handle, ret); + goto end_cpu_access; + } + if (iosys_map_is_null(&map)) { + ret = -ENOMEM; + goto end_cpu_access; + } + + iosys_map_memcpy_to(&map, offset, src, size); + + dma_buf_vunmap(buf, &map); + +end_cpu_access: + dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL); + +put_dma_buf: + dma_buf_put(buf); + + return ret; +} + +int32_t nvdla_data_read(void *driver_context, void *task_data, + uint64_t src, void *dst, + uint32_t size, uint64_t offset) +{ + int32_t ret; + struct dma_buf *buf; + struct iosys_map map; + struct nvdla_mem_handle *handles; + struct nvdla_task *task = (struct nvdla_task *)task_data; + struct nvdla_device *nvdla_dev = (struct nvdla_device *)driver_context; + + handles = task->address_list; + + buf = dma_buf_get(handles[src].handle); + if (IS_ERR(buf)) { + drm_err(nvdla_dev->drm, "Failed get dma_buf for handle=%d\n", + handles[src].handle); + return -EFAULT; + } + + ret = dma_buf_begin_cpu_access(buf, DMA_BIDIRECTIONAL); + if (ret) + goto put_dma_buf; + ret = dma_buf_vmap(buf, &map); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to vmap dma_buf for handle=%d red %d\n", + handles[src].handle, ret); + goto end_cpu_access; + } + if (iosys_map_is_null(&map)) { + ret = -ENOMEM; + goto end_cpu_access; + } + + iosys_map_memcpy_from(dst, &map, offset, size); + + dma_buf_vunmap(buf, &map); + +end_cpu_access: + dma_buf_end_cpu_access(buf, DMA_BIDIRECTIONAL); + +put_dma_buf: + dma_buf_put(buf); + + return ret; +} + +int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task) +{ + int32_t err = 0; + uint32_t task_complete = 0; + + //trace_printk("__nvdla_task_submit_entry\n"); + nvdla_dev->task = task; + + err = nvdla_execute_task(nvdla_dev->engine_context, (void *)task, nvdla_dev->config_data); + if (err) { + drm_err(nvdla_dev->drm, "Task execution failed\n"); + return err; + } + + drm_dbg(nvdla_dev->drm, "Wait for task complete\n"); + + while (1) { + unsigned long flags; + + wait_for_completion(&nvdla_dev->event_notifier); + + spin_lock_irqsave(&nvdla_dev->nvdla_lock, flags); + + err = nvdla_process_events(nvdla_dev->engine_context, &task_complete); + + spin_unlock_irqrestore(&nvdla_dev->nvdla_lock, flags); + + if (err || task_complete) + break; + } + + drm_dbg(nvdla_dev->drm, "Task complete\n"); + nvdla_clear_task(nvdla_dev->engine_context); + //trace_printk("__nvdla_task_submit_exit\n"); + + return err; +} + +static union nvdla_operation_container operation_desc[NVDLA_OP_NUM][NVDLA_NUM_GROUPS]; +static union nvdla_surface_container surface_desc[NVDLA_OP_NUM][NVDLA_NUM_GROUPS]; + +static struct nvdla_task_desc global_task; + +static struct nvdla_engine engine = { + .processors[NVDLA_OP_BDMA] = { + .name = "BDMA", + .op_type = NVDLA_OP_BDMA, + .program = nvdla_bdma_program, + .enable = nvdla_bdma_enable, + .set_producer = nvdla_bdma_set_producer, 
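+ /* readiness/debug/rdma hooks (see struct nvdla_processor), then per-group state initialised idle */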
+ .is_ready = nvdla_bdma_is_ready, + .dump_config = nvdla_bdma_dump_config, + .rdma_check = nvdla_bdma_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_BDMA][0], + .surface_desc = &surface_desc[NVDLA_OP_BDMA][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_BDMA][1], + .surface_desc = &surface_desc[NVDLA_OP_BDMA][1], + }, + }, + .processors[NVDLA_OP_CONV] = { + .name = "Convolution", + .op_type = NVDLA_OP_CONV, + .program = nvdla_conv_program, + .enable = nvdla_conv_enable, + .set_producer = nvdla_conv_set_producer, + .is_ready = nvdla_conv_is_ready, + .dump_config = nvdla_conv_dump_config, + .rdma_check = nvdla_conv_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_CONV][0], + .surface_desc = &surface_desc[NVDLA_OP_CONV][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_CONV][1], + .surface_desc = &surface_desc[NVDLA_OP_CONV][1], + }, + }, + .processors[NVDLA_OP_SDP] = { + .name = "SDP", + .op_type = NVDLA_OP_SDP, + .program = nvdla_sdp_program, + .enable = nvdla_sdp_enable, + .set_producer = nvdla_sdp_set_producer, + .is_ready = nvdla_sdp_is_ready, + .dump_config = nvdla_sdp_dump_config, + .rdma_check = nvdla_sdp_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_SDP][0], + .surface_desc = &surface_desc[NVDLA_OP_SDP][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_SDP][1], + .surface_desc = &surface_desc[NVDLA_OP_SDP][1], + }, + }, + .processors[NVDLA_OP_PDP] = { + .name = "PDP", + .op_type = NVDLA_OP_PDP, + .program = nvdla_pdp_program, + .enable = nvdla_pdp_enable, + .set_producer = nvdla_pdp_set_producer, + .is_ready = nvdla_pdp_is_ready, + .dump_config = nvdla_pdp_dump_config, + .rdma_check = nvdla_pdp_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_PDP][0], + .surface_desc = &surface_desc[NVDLA_OP_PDP][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_PDP][1], + .surface_desc = &surface_desc[NVDLA_OP_PDP][1], + }, + }, + .processors[NVDLA_OP_CDP] = { + .name = "CDP", + .op_type = NVDLA_OP_CDP, + .program = nvdla_cdp_program, + .enable = nvdla_cdp_enable, + .set_producer = nvdla_cdp_set_producer, 
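+ /* same hook layout as the other processors; groups 0/1 start idle (events cleared, lut_index preset to -1) */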
+ .is_ready = nvdla_cdp_is_ready, + .dump_config = nvdla_cdp_dump_config, + .rdma_check = nvdla_cdp_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_CDP][0], + .surface_desc = &surface_desc[NVDLA_OP_CDP][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_CDP][1], + .surface_desc = &surface_desc[NVDLA_OP_CDP][1], + }, + }, + + .processors[NVDLA_OP_RUBIK] = { + .name = "RUBIK", + .op_type = NVDLA_OP_RUBIK, + .program = nvdla_rubik_program, + .enable = nvdla_rubik_enable, + .set_producer = nvdla_rubik_set_producer, + .is_ready = nvdla_rubik_is_ready, + .dump_config = nvdla_rubik_dump_config, + .rdma_check = nvdla_rubik_rdma_check, + .consumer_ptr = 0, + .roi_index = 0, + .group_status = 0, + .rdma_status = 0, + .last_group = 1, + .groups[0] = { + .id = 0, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_RUBIK][0], + .surface_desc = &surface_desc[NVDLA_OP_RUBIK][0], + }, + .groups[1] = { + .id = 1, + .rdma_id = 0, + .active = 0, + .events = 0, + .roi_index = 0, + .is_rdma_needed = 0, + .lut_index = -1, + .operation_desc = &operation_desc[NVDLA_OP_RUBIK][1], + .surface_desc = &surface_desc[NVDLA_OP_RUBIK][1], + }, + }, + +}; + +/* driver probe and init */ +static const struct of_device_id nvdla_of_match[] = { + { + .compatible = "nvidia,nvdla_os_initial", + .data = &nvdla_config_os_initial, + }, + { + .compatible = "nvidia,nvdla_2", + .data = &nvdla_config_small, + }, + { }, +}; + +static int32_t nvdla_probe(struct platform_device *pdev) +{ + int32_t err = 0; + struct resource *res; + struct nvdla_device *nvdla_dev; + struct device *dev = &pdev->dev; + const struct of_device_id *match; + + if (!pdev->dev.of_node) + return -EINVAL; + + match = of_match_device(nvdla_of_match, &pdev->dev); + if (!match) { + pr_err("Missing DT entry!\n"); + return -EINVAL; + } + + nvdla_dev = devm_kzalloc(dev, sizeof(*nvdla_dev), GFP_KERNEL); + if (!nvdla_dev) + return -ENOMEM; + + platform_set_drvdata(pdev, nvdla_dev); + nvdla_dev->pdev = pdev; + nvdla_dev->config_data = (struct nvdla_config *)match->data; + + init_completion(&nvdla_dev->event_notifier); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + nvdla_dev->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(nvdla_dev->base)) + return PTR_ERR(nvdla_dev->base); + + res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!res) { + dev_err(&pdev->dev, "no irq resource\n"); + return -EINVAL; + } + nvdla_dev->irq = res->start; + + err = devm_request_irq(&pdev->dev, nvdla_dev->irq, + nvdla_engine_isr, 0, + dev_name(&pdev->dev), nvdla_dev); + if (err) + return err; + + nvdla_dev->engine_context = &engine; + engine.task = &global_task; + engine.driver_context = (void *)nvdla_dev; + engine.task->task_data = NULL; + + nvdla_init_op_cache(&engine); + nvdla_clear_task(nvdla_dev->engine_context); + + err = nvdla_drm_probe(nvdla_dev); + if (err) + dev_err(&pdev->dev, "failed to register drm device\n"); + + return err; +} + +static int32_t __exit nvdla_remove(struct platform_device *pdev) +{ + struct nvdla_device *nvdla_dev = dev_get_drvdata(&pdev->dev); + + 
nvdla_drm_remove(nvdla_dev); + + return 0; +} + +static struct platform_driver nvdla_driver = { + .probe = nvdla_probe, + .remove = __exit_p(nvdla_remove), + .driver = { + .owner = THIS_MODULE, + .name = "NVDLA", + .of_match_table = of_match_ptr(nvdla_of_match), + }, +}; +module_platform_driver(nvdla_driver); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Nvidia Deep Learning Accelerator driver"); +MODULE_IMPORT_NS(DMA_BUF); diff --git a/drivers/gpu/drm/nvdla/nvdla_drv.h b/drivers/gpu/drm/nvdla/nvdla_drv.h new file mode 100644 index 000000000000..718d2c37ee94 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_drv.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION. + * Copyright (C) 2022 Cai Huoqing + */ + +#ifndef __NVDLA_DRV_H_ +#define __NVDLA_DRV_H_ + +#include <linux/completion.h> +#include <linux/device.h> +#include <linux/kref.h> +#include <linux/platform_device.h> +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/bits.h> +#include <linux/types.h> +#include <drm/drm_drv.h> +#include <drm/drm_print.h> + +#include "nvdla_engine.h" + +#define ASSERT_GOTO(_condition, _ret, _err_value, _goto) \ +do { \ + if (!(_condition)) { \ + _ret = _err_value; \ + pr_err("%s:%d failed %d\n", __func__, __LINE__, _ret); \ + goto _goto; \ + } else { \ + _ret = 0; \ + } \ +} while (0) + +/** + * @brief Task information submitted from user space + * + * ref Reference count for task + * num_addresses Number of addresses in address list + * nvdla_dev Pointer to NVDLA device + * address_list Address list + * file DRM file instance + */ +struct nvdla_task { + struct kref ref; + uint32_t num_addresses; + struct nvdla_device *nvdla_dev; + struct nvdla_mem_handle *address_list; + struct drm_file *file; +}; + +/** + * @brief Configuration parameters supported by the engine + * + * atom_size Memory smallest access size + * bdma_enable Defines whether bdma is supported + * rubik_enable Defines whether rubik is supported + * weight_compress_support Defines whether weight data compression is supported + */ +struct nvdla_config { + uint32_t atom_size; + bool bdma_enable; + bool rubik_enable; + bool weight_compress_support; +}; + +/** + * @brief NVDLA device + * + * irq Interrupt number associated with this device + * ref Reference count for device + * base IO mapped base address for device + * nvdla_lock Spinlock used for synchronization + * drm DRM device instance + * task Pointer to task in execution + * config_data Pointer to the configuration data + * pdev Pointer to NVDLA platform device + * event_notifier Completion object used to wait for events from HW + * engine_context Private data passed from engine in nvdla_engine_init + */ +struct nvdla_device { + int32_t irq; + struct kref ref; + void __iomem *base; + spinlock_t nvdla_lock; + struct drm_device *drm; + struct nvdla_task *task; + struct nvdla_config *config_data; + struct platform_device *pdev; + struct completion event_notifier; + + struct nvdla_engine *engine_context; +}; + +int32_t nvdla_task_submit(struct nvdla_device *nvdla_dev, struct nvdla_task *task); + +int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file, + uint32_t fd, dma_addr_t *addr); + +int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev); + +void nvdla_drm_remove(struct nvdla_device *nvdla_dev); + +int32_t nvdla_process_events(struct nvdla_engine *engine_context, uint32_t *task_complete); + +void nvdla_clear_task(struct nvdla_engine *engine_context); + +int32_t 
nvdla_execute_task(struct nvdla_engine *engine_context, void *task_data, void *config_data); + +uint32_t nvdla_reg_read(void *driver_context, uint32_t addr); + +void nvdla_reg_write(void *driver_context, uint32_t addr, uint32_t reg); + +int32_t nvdla_data_read(void *driver_context, void *task_data, + uint64_t src, void *dst, + uint32_t size, uint64_t offset); + +int32_t nvdla_data_write(void *driver_context, void *task_data, + void *src, uint64_t dst, + uint32_t size, uint64_t offset); + +/* Destination for DMA buffer */ +#define DESTINATION_PROCESSOR 0 +#define DESTINATION_DMA 1 + +int32_t nvdla_get_dma_address(void *driver_context, void *task_data, + int16_t index, void *dst_ptr, + uint32_t destination); + +#endif diff --git a/drivers/gpu/drm/nvdla/nvdla_engine.c b/drivers/gpu/drm/nvdla/nvdla_engine.c new file mode 100644 index 000000000000..0b6553bc83c4 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_engine.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +static const uint32_t map_rdma_ptr_addr[] = { + 0xFFFFFFFF, + 0xFFFFFFFF, + SDP_REG(RDMA_S_POINTER), + PDP_REG(RDMA_S_POINTER), + CDP_REG(RDMA_S_POINTER), + 0xFFFFFFFF, +}; + +static const uint32_t map_sts_addr[] = { + BDMA_REG(STATUS), + CACC_REG(S_STATUS), + SDP_REG(S_STATUS), + PDP_REG(S_STATUS), + CDP_REG(S_STATUS), + RBK_REG(S_STATUS), +}; + +static const uint32_t map_ptr_addr[] = { + BDMA_REG(STATUS), + CACC_REG(S_POINTER), + SDP_REG(S_POINTER), + PDP_REG(S_POINTER), + CDP_REG(S_POINTER), + RBK_REG(S_POINTER), +}; + +uint32_t reg_read(struct nvdla_engine *engine, uint32_t addr) +{ + return nvdla_reg_read(engine->driver_context, addr); +} + +void reg_write(struct nvdla_engine *engine, uint32_t addr, uint32_t reg) +{ + nvdla_reg_write(engine->driver_context, addr, reg); +} + +int32_t nvdla_enable_intr(struct nvdla_engine *engine, uint32_t mask) +{ + uint32_t reg = glb_reg_read(engine, S_INTR_MASK); + + reg = reg & (~mask); + glb_reg_write(engine, S_INTR_MASK, reg); + + return 0; +} + +uint8_t bdma_grp_sts[2] = { + FIELD_ENUM(BDMA_STATUS_0, IDLE, YES), + FIELD_ENUM(BDMA_STATUS_0, IDLE, YES) +}; + +struct nvdla_roi_desc roi_desc; + +/** + * Get DMA data cube address + */ +int32_t +nvdla_get_dma_cube_address(void *driver_context, void *task_data, + int16_t index, uint32_t offset, void *dst_ptr, + uint32_t destination) +{ + int32_t ret = 0; + uint64_t *pdst = (uint64_t *)dst_ptr; + ret = nvdla_get_dma_address(driver_context, task_data, index, + dst_ptr, destination); + if (ret) + goto exit; + + pdst[0] += offset; + +exit: + return ret; +} + +/** + * Read input buffer address + * + * For input layer, in case of static ROI this address is read + * from address list and index is specified in data cube. 
In case + * dynamic ROI, it has to be read depending on ROI information + * and using surface address + * + * For all other layers, this address is read from address list + * using index specified in data cube + */ +int +nvdla_read_input_address(struct nvdla_engine *engine, struct nvdla_data_cube *data, + uint64_t *address, int16_t op_index, + uint8_t roi_index, uint8_t bpp) +{ + uint64_t roi_desc_addr; + int32_t ret = -EINVAL; + + /** + * If memory type is HW then no address required + */ + if (data->type == NVDLA_MEM_HW) { + ret = 0; + goto exit; + } + + /** + * If address list index is not -1 means this address has to + * be read from address list + */ + if (data->address != -1) { + + /** + * But if other parameters indicate that this is input layer + * for dynamic ROI then it is an error + */ + if (engine->network->dynamic_roi && + engine->network->input_layer == op_index) + goto exit; + ret = nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + data->address, + data->offset, + (void *)address, + DESTINATION_DMA); + goto exit; + } + + /** + * Check if it is dynamic ROI and this is input layer + */ + if (engine->network->dynamic_roi && engine->network->input_layer == op_index) { + if (!engine->task->surface_addr) + goto exit; + + /* Calculate address of ROI descriptor in array */ + roi_desc_addr = engine->task->roi_array_addr; + + /* Read ROI descriptor */ + ret = nvdla_data_read(engine->driver_context, + engine->task->task_data, + roi_desc_addr, + (void *)&roi_desc, + sizeof(roi_desc), + sizeof(struct nvdla_roi_array_desc) + + roi_index * sizeof(struct nvdla_roi_desc)); + if (ret) + goto exit; + + /* Calculate ROI address */ + *address = engine->task->surface_addr; + *address += (roi_desc.top * data->line_stride) + + (bpp * roi_desc.left); + } + +exit: + return ret; +} + +int +utils_get_free_group(struct nvdla_engine *engine, struct nvdla_processor *processor, + uint8_t *group_id, uint8_t *rdma_id) +{ + int32_t ret = 0; + uint32_t pointer; + uint32_t hw_consumer_ptr; + uint32_t hw_rdma_ptr; + + hw_rdma_ptr = 0; + + if (processor->op_type == NVDLA_OP_BDMA) { + pointer = reg_read(engine, map_ptr_addr[processor->op_type]); + hw_consumer_ptr = ((pointer & MASK(BDMA_STATUS_0, GRP0_BUSY)) >> + SHIFT(BDMA_STATUS_0, GRP0_BUSY)) == + FIELD_ENUM(BDMA_STATUS_0, GRP0_BUSY, YES) ? 
+ 1 : 0; + } else { + pointer = reg_read(engine, map_ptr_addr[processor->op_type]); + hw_consumer_ptr = (pointer & MASK(CDP_S_POINTER_0, CONSUMER)) >> + SHIFT(CDP_S_POINTER_0, CONSUMER); + + /** + * Read current consumer pointer for RDMA only if processor + * has RDMA module + */ + if (map_rdma_ptr_addr[processor->op_type] != 0xFFFFFFFF) { + pointer = + reg_read(engine, map_rdma_ptr_addr[processor->op_type]); + hw_rdma_ptr = (pointer & + MASK(CDP_S_POINTER_0, CONSUMER)) >> + SHIFT(CDP_S_POINTER_0, CONSUMER); + } + } + + /** + * If both processors are programmed then exit + */ + if (processor->group_status == 0x3) { + ret = -EBUSY; + goto exit; + } + + if (!processor->group_status) + /** + * If both groups are idle then use consumer pointer + */ + *group_id = hw_consumer_ptr; + else + /** + * Here it is assumed that only one group is idle or busy + * and hence right shift will work to get correct + * group id + */ + *group_id = !(processor->group_status >> 1); + + /** + * If both groups are idle then read group id from pointer + */ + if (!processor->rdma_status) + *rdma_id = hw_rdma_ptr; + else + *rdma_id = !(processor->rdma_status >> 1); + +exit: + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_engine.h b/drivers/gpu/drm/nvdla/nvdla_engine.h new file mode 100644 index 000000000000..57b85679de79 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_engine.h @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION. + * Copyright (C) 2022 Cai Huoqing + */ + +#ifndef __NVDLA_ENGINE_H_ +#define __NVDLA_ENGINE_H_ + +#include "nvdla_common.h" +#include "nvdla_sched.h" +#include "nvdla_engine.h" +#include "nvdla_reg.h" + +/*********************************************************/ +/******************** Utilities **************************/ +/*********************************************************/ +#define MASK(reg, field) (reg##_##field##_FIELD) +#define FIELD_ENUM(r, f, e) (r##_##f##_##e) +#define SHIFT(reg, field) (reg##_##field##_SHIFT) + +#define GLB_REG(name) GLB_##name##_0 +#define MCIF_REG(name) MCIF_##name##_0 +#define CVIF_REG(name) CVIF_##name##_0 +#define BDMA_REG(name) BDMA_##name##_0 +#define CDMA_REG(name) CDMA_##name##_0 +#define CSC_REG(name) CSC_##name##_0 +#define CMAC_A_REG(name) CMAC_A_##name##_0 +#define CMAC_B_REG(name) CMAC_B_##name##_0 +#define CACC_REG(name) CACC_##name##_0 +#define SDP_RDMA_REG(name) SDP_RDMA_##name##_0 +#define SDP_REG(name) SDP_##name##_0 +#define PDP_RDMA_REG(name) PDP_RDMA_##name##_0 +#define PDP_REG(name) PDP_##name##_0 +#define CDP_RDMA_REG(name) CDP_RDMA_##name##_0 +#define CDP_REG(name) CDP_##name##_0 +#define RBK_REG(name) RBK_##name##_0 + +/* alias for register read for each sub-module */ +#define glb_reg_read(engine, reg) reg_read(engine, GLB_REG(reg)) +#define bdma_reg_read(engine, reg) reg_read(engine, BDMA_REG(reg)) +#define cdma_reg_read(engine, reg) reg_read(engine, CDMA_REG(reg)) +#define csc_reg_read(engine, reg) reg_read(engine, CSC_REG(reg)) +#define cmac_a_reg_read(engine, reg) reg_read(engine, CMAC_A_REG(reg)) +#define cmac_b_reg_read(engine, reg) reg_read(engine, CMAC_B_REG(reg)) +#define cacc_reg_read(engine, reg) reg_read(engine, CACC_REG(reg)) +#define sdp_rdma_reg_read(engine, reg) reg_read(engine, SDP_RDMA_REG(reg)) +#define sdp_reg_read(engine, reg) reg_read(engine, SDP_REG(reg)) +#define pdp_rdma_reg_read(engine, reg) reg_read(engine, PDP_RDMA_REG(reg)) +#define pdp_reg_read(engine, reg) reg_read(engine, PDP_REG(reg)) +#define cdp_rdma_reg_read(engine, 
reg) reg_read(engine, CDP_RDMA_REG(reg)) +#define cdp_reg_read(engine, reg) reg_read(engine, CDP_REG(reg)) +#define rubik_reg_read(engine, reg) reg_read(engine, RBK_REG(reg)) + +/* alias for register write for each sub-module */ +#define glb_reg_write(engine, reg, val) reg_write(engine, GLB_REG(reg), val) +#define bdma_reg_write(engine, reg, val) reg_write(engine, BDMA_REG(reg), val) +#define cdma_reg_write(engine, reg, val) reg_write(engine, CDMA_REG(reg), val) +#define csc_reg_write(engine, reg, val) reg_write(engine, CSC_REG(reg), val) +#define cmac_a_reg_write(engine, reg, val) reg_write(engine, CMAC_A_REG(reg), val) +#define cmac_b_reg_write(engine, reg, val) reg_write(engine, CMAC_B_REG(reg), val) +#define cacc_reg_write(engine, reg, val) reg_write(engine, CACC_REG(reg), val) +#define sdp_rdma_reg_write(engine, reg, val) reg_write(engine, SDP_RDMA_REG(reg), val) +#define sdp_reg_write(engine, reg, val) reg_write(engine, SDP_REG(reg), val) +#define pdp_rdma_reg_write(engine, reg, val) reg_write(engine, PDP_RDMA_REG(reg), val) +#define pdp_reg_write(engine, reg, val) reg_write(engine, PDP_REG(reg), val) +#define cdp_rdma_reg_write(engine, reg, val) reg_write(engine, CDP_RDMA_REG(reg), val) +#define cdp_reg_write(engine, reg, val) reg_write(engine, CDP_REG(reg), val) +#define rubik_reg_write(engine, reg, val) reg_write(engine, RBK_REG(reg), val) + +struct nvdla_processor_group { + uint8_t id; + uint8_t rdma_id; + uint8_t active; + uint8_t events; + uint8_t roi_index; + uint8_t is_rdma_needed; + uint8_t pending; + int32_t lut_index; + uint8_t programming; + + struct nvdla_common_op_desc *op_desc; + struct nvdla_common_op_desc *consumers[NVDLA_OP_NUM]; + struct nvdla_common_op_desc *fused_parent; + union nvdla_operation_container *operation_desc; + union nvdla_surface_container *surface_desc; +}; + +struct nvdla_processor { + const char *name; + uint8_t op_type; + uint8_t consumer_ptr; + uint8_t roi_index; + uint8_t group_status; + uint8_t rdma_status; + uint8_t last_group; + + struct nvdla_common_op_desc *tail_op; + struct nvdla_processor_group groups[NVDLA_NUM_GROUPS]; + struct nvdla_engine *engine; + + int (*is_ready)(struct nvdla_processor *processor, + struct nvdla_processor_group *group); + int (*enable)(struct nvdla_engine *engine, struct nvdla_processor_group *group); + int (*program)(struct nvdla_engine *engine, struct nvdla_processor_group *group); + void (*set_producer)(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_id); + void (*dump_config)(struct nvdla_processor_group *group); + void (*rdma_check)(struct nvdla_processor_group *group); +}; + +struct nvdla_engine { + struct nvdla_task_desc *task; + struct nvdla_config *config_data; + struct nvdla_network_desc *network; + struct nvdla_processor processors[NVDLA_OP_NUM]; + + uint16_t num_proc_hwl; + int32_t status; + + void *driver_context; +}; + +void reg_write(struct nvdla_engine *engine, uint32_t addr, uint32_t reg); +uint32_t reg_read(struct nvdla_engine *engine, uint32_t addr); + +/** + * Operation descriptor cache functions + */ +void +nvdla_put_op_desc(struct nvdla_engine *engine, struct nvdla_common_op_desc *op_desc); +struct nvdla_common_op_desc +*nvdla_get_op_desc(struct nvdla_engine *engine, struct nvdla_task_desc *task, + int16_t index, + uint8_t op_type, + uint8_t roi_index); +void +nvdla_get_refcount(struct nvdla_common_op_desc *op_desc); +void +nvdla_init_op_cache(struct nvdla_engine *engine); + +/** + * Operation completion handler + */ +int +nvdla_op_completion(struct nvdla_engine *engine, struct 
nvdla_processor *processor, + struct nvdla_processor_group *group); + +int32_t +nvdla_read_lut(struct nvdla_engine *engine, int16_t index, void *dst); +int +nvdla_enable_intr(struct nvdla_engine *engine, uint32_t mask); +int +utils_get_free_group(struct nvdla_engine *engine, struct nvdla_processor *processor, + uint8_t *group_id, + uint8_t *rdma_id); +int32_t +nvdla_get_dma_cube_address(void *driver_context, + void *task_data, + int16_t index, + uint32_t offset, + void *dst_ptr, + uint32_t destination); +int +nvdla_read_input_address(struct nvdla_engine *engine, struct nvdla_data_cube *data, + uint64_t *address, + int16_t op_index, + uint8_t roi_index, + uint8_t bpp); + +/** + * BDMA operations + */ +void +nvdla_bdma_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id); +int +nvdla_bdma_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_bdma_program(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_bdma_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group); +void +nvdla_bdma_dump_config(struct nvdla_processor_group *group); +void +nvdla_bdma_rdma_check(struct nvdla_processor_group *group); + +/** + * Convolution operations + */ +void +nvdla_conv_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id); +int +nvdla_conv_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_conv_program(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_conv_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group); +void +nvdla_conv_dump_config(struct nvdla_processor_group *group); +void +nvdla_conv_rdma_check(struct nvdla_processor_group *group); + +/** + * SDP operations + */ +void +nvdla_sdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id); +int +nvdla_sdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_sdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_sdp_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group); +void +nvdla_sdp_dump_config(struct nvdla_processor_group *group); +void +nvdla_sdp_rdma_check(struct nvdla_processor_group *group); + +/** + * PDP operations + */ +void +nvdla_pdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id); +int +nvdla_pdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_pdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_pdp_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group); +void +nvdla_pdp_dump_config(struct nvdla_processor_group *group); +void +nvdla_pdp_rdma_check(struct nvdla_processor_group *group); + +/** + * CDP operations + */ +void +nvdla_cdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id); +int +nvdla_cdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_cdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group); +int +nvdla_cdp_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group); +void +nvdla_cdp_dump_config(struct nvdla_processor_group *group); +void +nvdla_cdp_rdma_check(struct nvdla_processor_group *group); + +/** + * RUBIK operations + */ +void +nvdla_rubik_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t 
rdma_group_id);
+int
+nvdla_rubik_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group);
+int
+nvdla_rubik_program(struct nvdla_engine *engine, struct nvdla_processor_group *group);
+int
+nvdla_rubik_is_ready(struct nvdla_processor *processor,
+		     struct nvdla_processor_group *group);
+void
+nvdla_rubik_dump_config(struct nvdla_processor_group *group);
+void
+nvdla_rubik_rdma_check(struct nvdla_processor_group *group);
+
+#endif
diff --git a/drivers/gpu/drm/nvdla/nvdla_gem.c b/drivers/gpu/drm/nvdla/nvdla_gem.c
new file mode 100644
index 000000000000..1e75eed9b7b4
--- /dev/null
+++ b/drivers/gpu/drm/nvdla/nvdla_gem.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/*
+ * Copyright (C) 2017-2018 NVIDIA CORPORATION
+ * Copyright (C) 2022 Cai Huoqing
+ */
+
+#include <linux/dma-buf.h>
+#include <drm/drm_drv.h>
+#include <drm/drm_gem_cma_helper.h>
+#include <drm/nvdla_drm.h>
+
+#include "nvdla_drv.h"
+
+#define to_nvdla_obj(x) container_of(x, struct nvdla_gem_object, object)
+
+struct nvdla_gem_object {
+	struct drm_gem_object object;
+
+	struct iosys_map map;
+
+	dma_addr_t dma_addr;
+	unsigned long dma_attrs;
+};
+
+static int32_t nvdla_fill_task_desc(struct nvdla_ioctl_submit_task *local_task,
+				    struct nvdla_task *task)
+{
+	struct nvdla_mem_handle *handles;
+
+	/* update task desc fields */
+	task->num_addresses = local_task->num_addresses;
+
+	handles = kcalloc(local_task->num_addresses,
+			  sizeof(struct nvdla_mem_handle), GFP_KERNEL);
+	if (handles == NULL)
+		return -ENOMEM;
+
+	/* get user addresses list */
+	if (copy_from_user(handles,
+			   (void __user *)local_task->address_list,
+			   (task->num_addresses *
+			   sizeof(struct nvdla_mem_handle)))) {
+		pr_err("failed to copy address list from user ptr\n");
+		kfree(handles);
+		return -EFAULT;
+	}
+
+	task->address_list = handles;
+
+	return 0;
+}
+
+static int32_t nvdla_submit(struct drm_device *drm, void *arg,
+			    struct drm_file *file)
+{
+	int32_t err = 0;
+	struct nvdla_task *task;
+	struct nvdla_ioctl_submit_task local_task;
+	struct nvdla_ioctl_submit_task __user *user_task;
+	struct nvdla_device *nvdla_dev = dev_get_drvdata(drm->dev);
+	struct nvdla_submit_args *args =
+			(struct nvdla_submit_args *)arg;
+
+	user_task = (struct nvdla_ioctl_submit_task __user *)
+			(uintptr_t)args->tasks;
+	if (!user_task)
+		return -EINVAL;
+
+	/* IOCTL copy descriptors */
+	if (copy_from_user(&local_task, (void __user *)user_task,
+			   (sizeof(*user_task))))
+		return -EFAULT;
+
+	task = kzalloc(sizeof(*task), GFP_KERNEL);
+	if (task == NULL)
+		return -ENOMEM;
+
+	nvdla_dev->task = task;
+	kref_init(&task->ref);
+	task->nvdla_dev = nvdla_dev;
+	task->file = file;
+
+	/* update task desc fields */
+	err = nvdla_fill_task_desc(&local_task, task);
+	if (err)
+		goto free_task_desc;
+
+	err = nvdla_task_submit(nvdla_dev, task);
+
+	kfree(task->address_list);
+
+free_task_desc:
+	kfree(task);
+	return err;
+}
+
+static int32_t nvdla_gem_alloc(struct nvdla_gem_object *nobj)
+{
+	struct drm_gem_object *dobj = &nobj->object;
+	struct drm_device *drm = dobj->dev;
+	void *kvaddr;
+
+	nobj->dma_attrs = DMA_ATTR_WRITE_COMBINE;
+
+	kvaddr = dma_alloc_attrs(drm->dev, dobj->size, &nobj->dma_addr,
+				 GFP_KERNEL, nobj->dma_attrs);
+	if (!kvaddr)
+		return -ENOMEM;
+
+	iosys_map_set_vaddr(&nobj->map, kvaddr);
+
+	return 0;
+}
+
+static void nvdla_gem_free(struct nvdla_gem_object *nobj)
+{
+	struct drm_gem_object *dobj = &nobj->object;
+	struct drm_device *drm = dobj->dev;
+
+	dma_free_attrs(drm->dev, dobj->size, nobj->map.vaddr, nobj->dma_addr,
+ nobj->dma_attrs); +} + +static void nvdla_gem_free_object(struct drm_gem_object *dobj) +{ + struct nvdla_gem_object *nobj; + + drm_gem_free_mmap_offset(dobj); + + nobj = to_nvdla_obj(dobj); + + nvdla_gem_free(nobj); + + kfree(nobj); +} + +static struct nvdla_gem_object * +nvdla_gem_create_object(struct drm_device *drm, uint32_t size) +{ + int32_t ret; + struct drm_gem_object *dobj; + struct nvdla_gem_object *nobj; + + size = round_up(size, PAGE_SIZE); + + nobj = kzalloc(sizeof(*nobj), GFP_KERNEL); + if (!nobj) + return ERR_PTR(-ENOMEM); + + dobj = &nobj->object; + + drm_gem_private_object_init(drm, dobj, size); + + ret = nvdla_gem_alloc(nobj); + if (ret) + goto free_nvdla_obj; + + return nobj; + +free_nvdla_obj: + kfree(nobj); + return ERR_PTR(ret); +} + +static struct sg_table* +nvdla_drm_gem_prime_get_sg_table(struct drm_gem_object *dobj) +{ + int32_t ret; + struct sg_table *sgt; + struct drm_device *drm = dobj->dev; + struct nvdla_gem_object *nobj = to_nvdla_obj(dobj); + + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + ret = dma_get_sgtable_attrs(drm->dev, sgt, nobj->map.vaddr, + nobj->dma_addr, dobj->size, + nobj->dma_attrs); + if (ret) { + DRM_ERROR("failed to allocate sgt, %d\n", ret); + kfree(sgt); + return ERR_PTR(ret); + } + + return sgt; +} + +static int nvdla_drm_gem_prime_vmap(struct drm_gem_object *obj, struct iosys_map *map) +{ + struct nvdla_gem_object *nobj = to_nvdla_obj(obj); + + *map = nobj->map; + + return 0; +} + +static void nvdla_drm_gem_prime_vunmap(struct drm_gem_object *obj, struct iosys_map *map) +{ + /* Nothing to do */ +} + +static int32_t nvdla_drm_gem_object_mmap(struct drm_gem_object *dobj, + struct vm_area_struct *vma) +{ + int32_t ret; + struct nvdla_gem_object *nobj = to_nvdla_obj(dobj); + struct drm_device *drm = dobj->dev; + + vma->vm_flags &= ~VM_PFNMAP; + vma->vm_pgoff -= drm_vma_node_start(&dobj->vma_node); + + ret = dma_mmap_attrs(drm->dev, vma, nobj->map.vaddr, nobj->dma_addr, + dobj->size, nobj->dma_attrs); + if (ret) + drm_gem_vm_close(vma); + + return ret; +} + +static const struct drm_gem_object_funcs nvdla_gem_object_funcs = { + .free = nvdla_gem_free_object, + .get_sg_table = nvdla_drm_gem_prime_get_sg_table, + .vmap = nvdla_drm_gem_prime_vmap, + .vunmap = nvdla_drm_gem_prime_vunmap, + .mmap = nvdla_drm_gem_object_mmap, +}; + +static struct nvdla_gem_object* +nvdla_gem_create_with_handle(struct drm_file *file_priv, + struct drm_device *drm, uint32_t size, + uint32_t *handle) +{ + int32_t ret; + struct drm_gem_object *dobj; + struct nvdla_gem_object *nobj; + + nobj = nvdla_gem_create_object(drm, size); + if (IS_ERR(nobj)) + return ERR_CAST(nobj); + + dobj = &nobj->object; + dobj->funcs = &nvdla_gem_object_funcs; + ret = drm_gem_handle_create(file_priv, dobj, handle); + if (ret) + goto free_drm_object; + + drm_gem_object_put(dobj); + + return nobj; + +free_drm_object: + nvdla_gem_free_object(dobj); + + return ERR_PTR(ret); +} + +static int32_t nvdla_gem_create(struct drm_device *drm, void *data, + struct drm_file *file) +{ + struct nvdla_gem_object *nobj; + struct nvdla_gem_create_args *args = data; + + nobj = nvdla_gem_create_with_handle(file, drm, args->size, + &args->handle); + if (IS_ERR(nobj)) + return PTR_ERR(nobj); + + return 0; +} + +int32_t nvdla_gem_dma_addr(struct drm_device *dev, struct drm_file *file, + uint32_t fd, dma_addr_t *addr) +{ + int32_t ret; + uint32_t handle; + struct nvdla_gem_object *nobj; + struct drm_gem_object *dobj; + + ret = drm_gem_prime_fd_to_handle(dev, file, fd, &handle); 
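+	/* on success, look up the GEM object behind this handle and hand back its backing DMA address */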
+ if (ret) + return ret; + + dobj = drm_gem_object_lookup(file, handle); + if (!dobj) + return -EINVAL; + + nobj = to_nvdla_obj(dobj); + + *addr = nobj->dma_addr; + + drm_gem_object_put(dobj); + + return 0; +} + +static int32_t nvdla_gem_map_offset(struct drm_device *drm, void *data, + struct drm_file *file) +{ + struct nvdla_gem_map_offset_args *args = data; + + return drm_gem_dumb_map_offset(file, drm, args->handle, + &args->offset); +} + +DEFINE_DRM_GEM_FOPS(nvdla_drm_fops); + +static const struct drm_ioctl_desc nvdla_drm_ioctls[] = { + DRM_IOCTL_DEF_DRV(NVDLA_SUBMIT, nvdla_submit, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(NVDLA_GEM_CREATE, nvdla_gem_create, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(NVDLA_GEM_MMAP, nvdla_gem_map_offset, DRM_RENDER_ALLOW), + /* use DRM_IOCTL_MODE_DESTROY_DUMB to destroy */ +}; + +static struct drm_driver nvdla_drm_driver = { + .driver_features = DRIVER_GEM | DRIVER_RENDER, + + .ioctls = nvdla_drm_ioctls, + .num_ioctls = ARRAY_SIZE(nvdla_drm_ioctls), + .fops = &nvdla_drm_fops, + .gem_prime_mmap = drm_gem_prime_mmap, + + .name = "nvdla", + .desc = "NVDLA driver", + .date = "20171017", + .major = 0, + .minor = 0, + .patchlevel = 0, +}; + +int32_t nvdla_drm_probe(struct nvdla_device *nvdla_dev) +{ + int32_t err; + struct drm_device *drm; + struct drm_driver *driver = &nvdla_drm_driver; + + drm = drm_dev_alloc(driver, &nvdla_dev->pdev->dev); + if (IS_ERR(drm)) + return PTR_ERR(drm); + + nvdla_dev->drm = drm; + + err = drm_dev_register(drm, 0); + if (err < 0) + goto unref; + + return 0; + +unref: + drm_dev_put(drm); + return err; +} + +void nvdla_drm_remove(struct nvdla_device *nvdla_dev) +{ + drm_dev_unregister(nvdla_dev->drm); + drm_dev_put(nvdla_dev->drm); +} diff --git a/drivers/gpu/drm/nvdla/nvdla_pdp.c b/drivers/gpu/drm/nvdla/nvdla_pdp.c new file mode 100644 index 000000000000..a006c8d6fd57 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_pdp.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_common.h" +#include "nvdla_engine.h" + +#define MAX_SPLIT_NUM 64 +//#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a[0]))) + +static const uint8_t map_ram[] = { + FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, MC), + FIELD_ENUM(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE, CV), +}; + +static const uint8_t map_pool[] = { + FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0, + POOLING_METHOD, POOLING_METHOD_AVERAGE), + FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0, + POOLING_METHOD, POOLING_METHOD_MAX), + FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0, + POOLING_METHOD, POOLING_METHOD_MIN), +}; + +static const uint8_t map_precision[] = { + FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT8), + FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, INT16), + FIELD_ENUM(PDP_D_DATA_FORMAT_0, INPUT_DATA, FP16), +}; + +static const uint8_t map_pool_kernel[] = { + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_1), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_2), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_3), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_4), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_5), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_6), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_7), + FIELD_ENUM(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH, KERNEL_WIDTH_8), +}; + +/* The reciprocal 
of kernel width: 1/1, 1/2, 1/3, ... */
+static const uint32_t recip_kernel_size[2][8] = {
+	/*
+	 * INT8/16
+	 * 1        1/2     1/3     1/4     1/5     1/6     1/7     1/8
+	 */
+	{0x10000, 0x8000, 0x5555, 0x4000, 0x3333, 0x2aaa, 0x2492, 0x2000},
+	{0x7c00, 0x7800, 0x7555, 0x7400, 0x7266, 0x7155, 0x7092, 0x7000},
+};
+
+static uint32_t
+get_fly_mode(uint8_t type)
+{
+	uint32_t val;
+
+	val = type == NVDLA_MEM_HW ?
+			FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
+				   FLYING_MODE, ON_FLYING) :
+			FIELD_ENUM(PDP_D_OPERATION_MODE_CFG_0,
+				   FLYING_MODE, OFF_FLYING);
+
+	return val;
+}
+
+void
+nvdla_pdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id)
+{
+	uint32_t reg;
+	struct nvdla_device *nvdla_dev =
+		(struct nvdla_device *)engine->driver_context;
+
+	drm_dbg(nvdla_dev->drm, "group id %d rdma id %d\n", group_id, rdma_group_id);
+
+	reg = group_id << SHIFT(PDP_S_POINTER_0, PRODUCER);
+	pdp_reg_write(engine, S_POINTER, reg);
+
+	reg = rdma_group_id << SHIFT(PDP_RDMA_S_POINTER_0, PRODUCER);
+	pdp_rdma_reg_write(engine, S_POINTER, reg);
+}
+
+int
+nvdla_pdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group)
+{
+	int32_t ret = 0;
+	uint32_t reg;
+	struct nvdla_device *nvdla_dev =
+		(struct nvdla_device *)engine->driver_context;
+
+	if (!group) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	drm_dbg(nvdla_dev->drm, "rdma needed %u\n", group->is_rdma_needed);
+
+	/* enable all sub-modules */
+	if (group->is_rdma_needed) {
+		reg = FIELD_ENUM(PDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE);
+		pdp_rdma_reg_write(engine, D_OP_ENABLE, reg);
+	}
+	reg = FIELD_ENUM(PDP_D_OP_ENABLE_0, OP_EN, ENABLE);
+	pdp_reg_write(engine, D_OP_ENABLE, reg);
+
+exit:
+	return ret;
+}
+
+void
+nvdla_pdp_rdma_check(struct nvdla_processor_group *group)
+{
+	struct nvdla_pdp_surface_desc *pdp_surface;
+
+	pdp_surface = &group->surface_desc->pdp_surface;
+
+	group->is_rdma_needed = 0;
+
+	if (pdp_surface->src_data.type != NVDLA_MEM_HW)
+		group->is_rdma_needed = 1;
+}
+
+static int
+validate_strides(uint8_t stride_x, uint8_t stride_y)
+{
+	int32_t ret = 0;
+
+	if (stride_x < 1 || stride_y < 1 || stride_x > 8 || stride_y > 8) {
+		pr_err("Invalid Stride (x[%d], y[%d])\n", stride_x, stride_y);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int
+validate_pdp_configs(struct nvdla_processor_group *group)
+{
+	int32_t ret = 0;
+	struct nvdla_pdp_op_desc *pdp_op;
+	struct nvdla_pdp_surface_desc *pdp_surface;
+
+	pdp_op = &group->operation_desc->pdp_op;
+	pdp_surface = &group->surface_desc->pdp_surface;
+
+	if (pdp_surface->dst_data.type == NVDLA_MEM_HW) {
+		pr_err("Destination buffer for PDP has to be either MC or CV\n");
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	ret = validate_data_cube(pdp_surface->src_data, pdp_surface->dst_data,
+				 NVDLA_MEM_HW);
+	if (ret)
+		goto exit;
+
+	ret = validate_precision(pdp_op->precision, ARRAY_SIZE(map_precision));
+	if (ret)
+		goto exit;
+
+	ret = validate_strides(pdp_op->stride_x, pdp_op->stride_y);
+	if (ret)
+		goto exit;
+
+	if (pdp_op->split_num > MAX_SPLIT_NUM) {
+		pr_err("Invalid split_num: %u\n", pdp_op->split_num);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	if (pdp_op->pool_width >= ARRAY_SIZE(map_pool_kernel)) {
+		pr_err("Invalid pool_width: %u\n", pdp_op->pool_width);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	if (pdp_op->pool_height >= ARRAY_SIZE(map_pool_kernel)) {
+		pr_err("Invalid pool_height: %u\n", pdp_op->pool_height);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	if (pdp_op->pool_mode >= ARRAY_SIZE(map_pool)) {
+		pr_err("Invalid pool_mode: %u\n", pdp_op->pool_mode);
+		ret = -EINVAL;
+		goto exit;
+	}
+
+exit:
+	return ret;
+}
+
+static int
+processor_pdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group)
+{
+	int32_t ret = 0;
+	uint32_t reg, high, low;
+	uint64_t input_address = 0;
+	uint64_t output_address = 0;
+	struct nvdla_pdp_op_desc *pdp_op;
+	struct nvdla_pdp_surface_desc *pdp_surface;
+
+	pdp_op = &group->operation_desc->pdp_op;
+	pdp_surface = &group->surface_desc->pdp_surface;
+
+	ret = validate_pdp_configs(group);
+	if (ret)
+		goto exit;
+
+	ret = nvdla_read_input_address(engine, &pdp_surface->src_data,
+				       &input_address,
+				       group->op_desc->index,
+				       group->roi_index,
+				       1);
+	if (ret)
+		goto exit;
+
+	if (pdp_surface->dst_data.address != -1)
+		nvdla_get_dma_cube_address(engine->driver_context,
+					   engine->task->task_data,
+					   pdp_surface->dst_data.address,
+					   pdp_surface->dst_data.offset,
+					   (void *)&output_address,
+					   DESTINATION_DMA);
+
+	if (pdp_surface->src_data.type != NVDLA_MEM_HW) {
+		/* PDP RDMA */
+		pdp_rdma_reg_write(engine, D_DATA_CUBE_IN_WIDTH,
+				   pdp_surface->src_data.width - 1);
+		pdp_rdma_reg_write(engine, D_DATA_CUBE_IN_HEIGHT,
+				   pdp_surface->src_data.height - 1);
+		pdp_rdma_reg_write(engine, D_DATA_CUBE_IN_CHANNEL,
+				   pdp_surface->src_data.channel - 1);
+
+		high = upper_32_bits(input_address);
+		low = lower_32_bits(input_address);
+		pdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_HIGH, high);
+		pdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_LOW, low);
+		pdp_rdma_reg_write(engine, D_SRC_LINE_STRIDE,
+				   pdp_surface->src_data.line_stride);
+		pdp_rdma_reg_write(engine, D_SRC_SURFACE_STRIDE,
+				   pdp_surface->src_data.surf_stride);
+
+		reg = (map_precision[pdp_op->precision]
+			<< SHIFT(PDP_RDMA_D_DATA_FORMAT_0, INPUT_DATA));
+		pdp_rdma_reg_write(engine, D_DATA_FORMAT, reg);
+
+		reg = map_ram[pdp_surface->src_data.type]
+			<< SHIFT(PDP_RDMA_D_SRC_RAM_CFG_0, SRC_RAM_TYPE);
+		pdp_rdma_reg_write(engine, D_SRC_RAM_CFG, reg);
+
+		reg = ((pdp_op->split_num - 1)
+			<< SHIFT(PDP_RDMA_D_OPERATION_MODE_CFG_0, SPLIT_NUM));
+		pdp_rdma_reg_write(engine, D_OPERATION_MODE_CFG, reg);
+
+		reg = (map_pool_kernel[pdp_op->pool_width]
+			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
+				 KERNEL_WIDTH)) |
+		      ((pdp_op->stride_x - 1)
+			<< SHIFT(PDP_RDMA_D_POOLING_KERNEL_CFG_0,
+				 KERNEL_STRIDE_WIDTH));
+		pdp_rdma_reg_write(engine, D_POOLING_KERNEL_CFG, reg);
+
+		reg = (pdp_op->pad_left
+			<< SHIFT(PDP_RDMA_D_POOLING_PADDING_CFG_0, PAD_WIDTH));
+		pdp_rdma_reg_write(engine, D_POOLING_PADDING_CFG, reg);
+
+		reg = ((pdp_op->partial_in_width_first == 0 ? 0 :
+			pdp_op->partial_in_width_first - 1)
+			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
+				 PARTIAL_WIDTH_IN_FIRST)) |
+		      ((pdp_op->partial_in_width_mid == 0 ? 0 :
+			pdp_op->partial_in_width_mid - 1)
+			<< SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0,
+				 PARTIAL_WIDTH_IN_MID)) |
+		      ((pdp_op->partial_in_width_last == 0 ?
0 : + pdp_op->partial_in_width_last - 1) + << SHIFT(PDP_RDMA_D_PARTIAL_WIDTH_IN_0, + PARTIAL_WIDTH_IN_LAST)); + pdp_rdma_reg_write(engine, D_PARTIAL_WIDTH_IN, reg); + } else { + ASSERT_GOTO(pdp_op->split_num == 1, ret, + -EINVAL, exit); + } + + reg = ((pdp_surface->src_data.width - 1) + << SHIFT(PDP_D_DATA_CUBE_IN_WIDTH_0, CUBE_IN_WIDTH)); + pdp_reg_write(engine, D_DATA_CUBE_IN_WIDTH, reg); + + reg = ((pdp_surface->src_data.height - 1) + << SHIFT(PDP_D_DATA_CUBE_IN_HEIGHT_0, CUBE_IN_HEIGHT)); + pdp_reg_write(engine, D_DATA_CUBE_IN_HEIGHT, reg); + + reg = ((pdp_surface->src_data.channel - 1) + << SHIFT(PDP_D_DATA_CUBE_IN_CHANNEL_0, CUBE_IN_CHANNEL)); + pdp_reg_write(engine, D_DATA_CUBE_IN_CHANNEL, reg); + + reg = ((pdp_surface->dst_data.width - 1) + << SHIFT(PDP_D_DATA_CUBE_OUT_WIDTH_0, CUBE_OUT_WIDTH)); + pdp_reg_write(engine, D_DATA_CUBE_OUT_WIDTH, reg); + + reg = ((pdp_surface->dst_data.height - 1) + << SHIFT(PDP_D_DATA_CUBE_OUT_HEIGHT_0, CUBE_OUT_HEIGHT)); + pdp_reg_write(engine, D_DATA_CUBE_OUT_HEIGHT, reg); + + reg = ((pdp_surface->dst_data.channel - 1) + << SHIFT(PDP_D_DATA_CUBE_OUT_CHANNEL_0, CUBE_OUT_CHANNEL)); + pdp_reg_write(engine, D_DATA_CUBE_OUT_CHANNEL, reg); + + reg = (map_pool[pdp_op->pool_mode] + << SHIFT(PDP_D_OPERATION_MODE_CFG_0, POOLING_METHOD)) | + (get_fly_mode(pdp_surface->src_data.type) + << SHIFT(PDP_D_OPERATION_MODE_CFG_0, FLYING_MODE)) | + ((pdp_op->split_num - 1) + << SHIFT(PDP_D_OPERATION_MODE_CFG_0, SPLIT_NUM)); + pdp_reg_write(engine, D_OPERATION_MODE_CFG, reg); + + reg = ((pdp_op->partial_in_width_first == 0 ? 0 : + pdp_op->partial_in_width_first-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_FIRST)) | + ((pdp_op->partial_in_width_mid == 0 ? 0 : + pdp_op->partial_in_width_mid-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_MID)) | + ((pdp_op->partial_in_width_last == 0 ? 0 : + pdp_op->partial_in_width_last-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_IN_0, PARTIAL_WIDTH_IN_LAST)); + pdp_reg_write(engine, D_PARTIAL_WIDTH_IN, reg); + + reg = ((pdp_op->partial_width_first == 0 ? 0 : + pdp_op->partial_width_first-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_FIRST)) | + ((pdp_op->partial_width_mid == 0 ? 0 : + pdp_op->partial_width_mid-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_MID)) | + ((pdp_op->partial_width_last == 0 ? 
0 : + pdp_op->partial_width_last-1) + << SHIFT(PDP_D_PARTIAL_WIDTH_OUT_0, PARTIAL_WIDTH_OUT_LAST)); + pdp_reg_write(engine, D_PARTIAL_WIDTH_OUT, reg); + + reg = (map_pool_kernel[pdp_op->pool_width] + << SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_WIDTH)) | + (map_pool_kernel[pdp_op->pool_height] + << SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_HEIGHT))| + ((pdp_op->stride_x - 1) + << SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_WIDTH)) | + ((pdp_op->stride_y - 1) + << SHIFT(PDP_D_POOLING_KERNEL_CFG_0, KERNEL_STRIDE_HEIGHT)); + pdp_reg_write(engine, D_POOLING_KERNEL_CFG, reg); + + pdp_reg_write(engine, D_RECIP_KERNEL_WIDTH, + recip_kernel_size[pdp_op->precision == + PRECISION_FP16][pdp_op->pool_width]); + pdp_reg_write(engine, D_RECIP_KERNEL_HEIGHT, + recip_kernel_size[pdp_op->precision == + PRECISION_FP16][pdp_op->pool_height]); + + reg = (pdp_op->pad_left + << SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_LEFT)) | + (pdp_op->pad_right + << SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_RIGHT)) | + (pdp_op->pad_top + << SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_TOP)) | + (pdp_op->pad_bottom + << SHIFT(PDP_D_POOLING_PADDING_CFG_0, PAD_BOTTOM)); + if (pdp_op->precision == PRECISION_FP16) { + int32_t i; + + for (i = 0; i < 7; i++) + ASSERT_GOTO(pdp_op->padding_value[i] == 0, ret, + -EINVAL, exit); + } + + pdp_reg_write(engine, D_POOLING_PADDING_CFG, reg); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_1_CFG, pdp_op->padding_value[0]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_2_CFG, pdp_op->padding_value[1]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_3_CFG, pdp_op->padding_value[2]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_4_CFG, pdp_op->padding_value[3]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_5_CFG, pdp_op->padding_value[4]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_6_CFG, pdp_op->padding_value[5]); + pdp_reg_write(engine, D_POOLING_PADDING_VALUE_7_CFG, pdp_op->padding_value[6]); + + if (pdp_surface->src_data.type != NVDLA_MEM_HW) { + pdp_reg_write(engine, D_SRC_LINE_STRIDE, + pdp_surface->src_data.line_stride); + pdp_reg_write(engine, D_SRC_SURFACE_STRIDE, + pdp_surface->src_data.surf_stride); + } + + high = upper_32_bits(output_address); + low = lower_32_bits(output_address); + pdp_reg_write(engine, D_DST_BASE_ADDR_LOW, low); + pdp_reg_write(engine, D_DST_BASE_ADDR_HIGH, high); + + pdp_reg_write(engine, D_DST_LINE_STRIDE, pdp_surface->dst_data.line_stride); + pdp_reg_write(engine, D_DST_SURFACE_STRIDE, pdp_surface->dst_data.surf_stride); + + reg = (map_ram[pdp_surface->dst_data.type] + << SHIFT(PDP_D_DST_RAM_CFG_0, DST_RAM_TYPE)); + pdp_reg_write(engine, D_DST_RAM_CFG, reg); + + reg = (map_precision[pdp_op->precision] + << SHIFT(PDP_D_DATA_FORMAT_0, INPUT_DATA)); + pdp_reg_write(engine, D_DATA_FORMAT, reg); + +exit: + return ret; +} + +int +nvdla_pdp_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + return 1; +} + +void +nvdla_pdp_dump_config(struct nvdla_processor_group *group) +{ + struct nvdla_pdp_op_desc *pdp_op; + struct nvdla_pdp_surface_desc *pdp_surface; + + pdp_surface = &group->surface_desc->pdp_surface; + pdp_op = &group->operation_desc->pdp_op; +} + +int +nvdla_pdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret; + + if (!group) { + ret = -EINVAL; + goto exit; + } + + nvdla_enable_intr(engine, MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK1) | + MASK(GLB_S_INTR_MASK_0, PDP_DONE_MASK0)); + + ret = processor_pdp_program(engine, group); + if (ret) + goto exit; + +exit: + return ret; +} diff 
--git a/drivers/gpu/drm/nvdla/nvdla_rubik.c b/drivers/gpu/drm/nvdla/nvdla_rubik.c new file mode 100644 index 000000000000..ccf6108c3531 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_rubik.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +static uint8_t map_rubik_mode[] = { + FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, CONTRACT), + FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, SPLIT), + FIELD_ENUM(RBK_D_MISC_CFG_0, RUBIK_MODE, MERGE), +}; + +static uint8_t map_ram_type[] = { + FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, MCIF), + FIELD_ENUM(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE, CVIF), +}; + +static uint8_t map_precision[] = { + FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, INT16), + FIELD_ENUM(RBK_D_MISC_CFG_0, IN_PRECISION, FP16), +}; + +static uint8_t map_bpe[] = { + BPE_PRECISION_INT8, + BPE_PRECISION_INT16, + BPE_PRECISION_FP16, +}; + +void +nvdla_rubik_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t __unused) +{ + uint32_t reg; + + /* set producer pointer for all sub-modules */ + reg = group_id << SHIFT(RBK_S_POINTER_0, PRODUCER); + rubik_reg_write(engine, S_POINTER, reg); +} + +int +nvdla_rubik_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + uint32_t reg; + + reg = FIELD_ENUM(RBK_D_OP_ENABLE_0, OP_EN, ENABLE); + rubik_reg_write(engine, D_OP_ENABLE, reg); + + return 0; +} + +void +nvdla_rubik_rdma_check(struct nvdla_processor_group *group) +{ + group->is_rdma_needed = 0; +} + +static int32_t +processor_rubik_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret = 0; + uint32_t reg, high, low; + uint64_t input_address = 0; + uint64_t output_address = 0; + struct nvdla_rubik_op_desc *rubik_op; + struct nvdla_rubik_surface_desc *rubik_surface; + + rubik_op = &group->operation_desc->rubik_op; + rubik_surface = &group->surface_desc->rubik_surface; + + /* Argument check */ + ASSERT_GOTO((rubik_surface->src_data.type != NVDLA_MEM_HW), + ret, -EINVAL, exit); + ASSERT_GOTO((rubik_surface->dst_data.type != NVDLA_MEM_HW), + ret, -EINVAL, exit); + + /* get the addresses from task descriptor */ + ret = nvdla_read_input_address(engine, &rubik_surface->src_data, + &input_address, + group->op_desc->index, + group->roi_index, + 1); + if (ret) + goto exit; + + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + rubik_surface->dst_data.address, + rubik_surface->dst_data.offset, + (void *)&output_address, + DESTINATION_DMA); + + /* config rubik */ + reg = (((uint32_t)map_rubik_mode[rubik_op->mode]) << + SHIFT(RBK_D_MISC_CFG_0, RUBIK_MODE)) | + (((uint32_t)map_precision[rubik_op->precision]) << + SHIFT(RBK_D_MISC_CFG_0, IN_PRECISION)); + rubik_reg_write(engine, D_MISC_CFG, reg); + reg = (((uint32_t)map_ram_type[rubik_surface->src_data.type]) << + SHIFT(RBK_D_DAIN_RAM_TYPE_0, DATAIN_RAM_TYPE)); + rubik_reg_write(engine, D_DAIN_RAM_TYPE, reg); + reg = ((rubik_surface->src_data.width-1) << + SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_WIDTH)) | + ((rubik_surface->src_data.height-1) << + SHIFT(RBK_D_DATAIN_SIZE_0_0, DATAIN_HEIGHT)); + rubik_reg_write(engine, D_DATAIN_SIZE_0, reg); + reg = ((rubik_surface->src_data.channel-1) << + SHIFT(RBK_D_DATAIN_SIZE_1_0, DATAIN_CHANNEL)); + rubik_reg_write(engine, D_DATAIN_SIZE_1, reg); + + high = 
upper_32_bits(input_address); + low = lower_32_bits(input_address); + rubik_reg_write(engine, D_DAIN_ADDR_LOW, low); + rubik_reg_write(engine, D_DAIN_ADDR_HIGH, high); + if (rubik_op->mode == RUBIK_MODE_MERGE) { + ASSERT_GOTO((rubik_surface->src_data.plane_stride != 0), + ret, -EINVAL, exit); + ASSERT_GOTO(((rubik_surface->src_data.plane_stride&0x1F) == 0), + ret, -EINVAL, exit); + rubik_reg_write(engine, D_DAIN_PLANAR_STRIDE, + rubik_surface->src_data.plane_stride); + } else { + rubik_reg_write(engine, D_DAIN_SURF_STRIDE, + rubik_surface->src_data.surf_stride); + } + rubik_reg_write(engine, D_DAIN_LINE_STRIDE, + rubik_surface->src_data.line_stride); + + reg = (((uint32_t)map_ram_type[rubik_surface->dst_data.type]) << + SHIFT(RBK_D_DAOUT_RAM_TYPE_0, DATAOUT_RAM_TYPE)); + rubik_reg_write(engine, D_DAOUT_RAM_TYPE, reg); + reg = ((rubik_surface->dst_data.channel-1) << + SHIFT(RBK_D_DATAOUT_SIZE_1_0, DATAOUT_CHANNEL)); + rubik_reg_write(engine, D_DATAOUT_SIZE_1, reg); + + high = upper_32_bits(output_address); + low = lower_32_bits(output_address); + rubik_reg_write(engine, D_DAOUT_ADDR_LOW, low); + rubik_reg_write(engine, D_DAOUT_ADDR_HIGH, high); + + rubik_reg_write(engine, D_DAOUT_LINE_STRIDE, + rubik_surface->dst_data.line_stride); + if (rubik_op->mode != RUBIK_MODE_SPLIT) { + rubik_reg_write(engine, D_DAOUT_SURF_STRIDE, + rubik_surface->dst_data.surf_stride); + if (rubik_op->mode == RUBIK_MODE_CONTRACT) { + reg = ((rubik_surface->dst_data.channel * + map_bpe[rubik_op->precision] + 31) >> 5) * + rubik_surface->src_data.surf_stride; + rubik_reg_write(engine, D_CONTRACT_STRIDE_0, reg); + + reg = rubik_op->stride_y * + rubik_surface->dst_data.line_stride; + rubik_reg_write(engine, D_CONTRACT_STRIDE_1, reg); + + reg = (((uint32_t)(rubik_op->stride_x-1)) << + SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_X_STRIDE)) | + (((uint32_t)(rubik_op->stride_y-1)) << + SHIFT(RBK_D_DECONV_STRIDE_0, DECONV_Y_STRIDE)); + rubik_reg_write(engine, D_DECONV_STRIDE, reg); + } + } else { + rubik_reg_write(engine, D_DAOUT_PLANAR_STRIDE, + rubik_surface->dst_data.plane_stride); + } + +exit: + return ret; +} + +int +nvdla_rubik_is_ready(struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + return 1; +} + +void +nvdla_rubik_dump_config(struct nvdla_processor_group *group) +{ + struct nvdla_rubik_op_desc *rubik_op; + struct nvdla_rubik_surface_desc *rubik_surface; + + rubik_surface = &group->surface_desc->rubik_surface; + rubik_op = &group->operation_desc->rubik_op; +} + +int +nvdla_rubik_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret = 0; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (!engine->config_data->rubik_enable) { + drm_err(nvdla_dev->drm, "RUBIK is not supported for this configuration\n"); + ret = -EINVAL; + goto exit; + } + + nvdla_enable_intr(engine, MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK1) | + MASK(GLB_S_INTR_MASK_0, RUBIK_DONE_MASK0)); + + ret = processor_rubik_program(engine, group); + if (ret) + goto exit; + +exit: + return ret; +} diff --git a/drivers/gpu/drm/nvdla/nvdla_sched.h b/drivers/gpu/drm/nvdla/nvdla_sched.h new file mode 100644 index 000000000000..b676b3fb6222 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_sched.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION. 
+ * Copyright (C) 2022 Cai Huoqing + */ + +#ifndef __NVDLA_SCHED_H_ +#define __NVDLA_SCHED_H_ + +struct nvdla_task_desc { + /* platform specific data to communicate with portability layer */ + void *task_data; + /* task state */ + uint32_t state; + /* Task base address */ + uint64_t base; + /* start address of a list of nvdla_operation_container */ + uint64_t operation_desc_addr; + /* start address of a list of nvdla_surface_container */ + uint64_t surface_desc_addr; + /* start address of a list of nvdla_common_op_desc */ + uint64_t dependency_graph_addr; + /* start address of a list of nvdla_lut_param */ + uint64_t lut_data_addr; + /* + * start address of a list of nvdla_roi_desc, + * the first one is nvdla_roi_array_desc + * valid when network.dynamic_roi is true + */ + uint64_t roi_array_addr; + /* start address of a list of nvdla_surface_container */ + uint64_t surface_addr; + /* start address of a list of nvdla_stat_container */ + uint64_t stat_data_addr; +} __packed __aligned(256); + +#endif diff --git a/drivers/gpu/drm/nvdla/nvdla_scheduler.c b/drivers/gpu/drm/nvdla/nvdla_scheduler.c new file mode 100644 index 000000000000..efe1e1fe5c8f --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_scheduler.c @@ -0,0 +1,1012 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_common.h" +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_engine.h" + +#define MAX_NUM_ADDRESSES 256 + +static uint64_t roi_array_length __aligned(8); +static struct nvdla_network_desc network; + +static int +nvdla_update_consumers(struct nvdla_engine *engine, + struct nvdla_processor_group *group, + struct nvdla_common_op_desc *op, uint8_t event); + +static int32_t +nvdla_read_address_list(struct nvdla_engine *engine) +{ + return 0; +} + +int32_t +nvdla_read_lut(struct nvdla_engine *engine, int16_t index, void *dst) +{ + int32_t ret = 0; + uint64_t src_addr; + + if (index == -1) { + ret = -EINVAL; + goto exit; + } + + src_addr = engine->task->lut_data_addr; + + ret = nvdla_data_read(engine->driver_context, + engine->task->task_data, + src_addr, (void *)dst, + sizeof(struct nvdla_lut_param), + (sizeof(struct nvdla_lut_param) * (uint64_t)index)); + +exit: + return ret; +} + +static int +nvdla_op_enabled(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret; + struct nvdla_common_op_desc *op_desc; + + op_desc = group->op_desc; + + group->active = 1; + + /* update dependency graph for this task */ + ret = nvdla_update_consumers(engine, group, op_desc, NVDLA_EVENT_OP_ENABLED); + + return ret; +} + +static int +nvdla_op_programmed(struct nvdla_engine *engine, struct nvdla_processor *processor, + struct nvdla_processor_group *group, + uint8_t rdma_id) +{ + int32_t ret; + struct nvdla_common_op_desc *op_desc; + + op_desc = group->op_desc; + + group->pending = 0; + + /* update dependency graph for this task */ + ret = nvdla_update_consumers(engine, group, op_desc, NVDLA_EVENT_OP_PROGRAMMED); + + return ret; +} + +static int32_t +nvdla_read_config(struct nvdla_engine *engine, struct nvdla_task_desc *task, + struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + int32_t ret; + uint64_t base; + int16_t index; + uint8_t roi_index; + + roi_index = group->roi_index; + index = group->op_desc->index; + + base = (sizeof(union nvdla_operation_container) * + (uint64_t)engine->network->num_operations * + (uint64_t)roi_index); + base = base + (sizeof(union 
nvdla_operation_container) * + (uint64_t)index); + + ret = nvdla_data_read(engine->driver_context, task->task_data, + task->operation_desc_addr, + (void *)group->operation_desc, + sizeof(union nvdla_operation_container), + base); + if (ret) + goto exit; + + base = (sizeof(union nvdla_surface_container) * + (uint64_t)engine->network->num_operations * + (uint64_t)roi_index); + + base = base + (sizeof(union nvdla_surface_container) * + (uint64_t)index); + + ret = nvdla_data_read(engine->driver_context, task->task_data, + task->surface_desc_addr, + (void *)group->surface_desc, + sizeof(union nvdla_surface_container), base); + if (ret) + goto exit; + + processor->dump_config(group); + +exit: + return ret; +} + +static void +nvdla_reset_group(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t i; + + for (i = 0; i < NVDLA_OP_NUM; i++) { + nvdla_put_op_desc(engine, group->consumers[i]); + group->consumers[i] = NULL; + } + + nvdla_put_op_desc(engine, group->fused_parent); + group->fused_parent = NULL; + + nvdla_put_op_desc(engine, group->op_desc); + group->op_desc = NULL; +} + +static int +nvdla_prepare_operation(struct nvdla_engine *engine, + struct nvdla_processor *processor, + struct nvdla_common_op_desc *op_desc, + uint8_t roi_index, uint32_t *group_number) +{ + int32_t ret = 0; + uint8_t group_id; + uint8_t rdma_id; + struct nvdla_processor_group *group; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + /* + * If not already programmed then find out if + * processor is free and which group is free + */ + ret = utils_get_free_group(engine, processor, &group_id, &rdma_id); + if (ret) { + drm_dbg(nvdla_dev->drm, "processor:%s register groups are busy\n", + processor->name); + goto exit; + } else { + drm_info(nvdla_dev->drm, + "processor:%s group:%d, rdma_group:%d available\n", + processor->name, group_id, rdma_id); + } + *group_number = group_id; + group = &processor->groups[group_id]; + + /* + * update operation descriptor + */ + group->op_desc = op_desc; + nvdla_get_refcount(op_desc); + group->id = group_id; + group->roi_index = roi_index; + group->rdma_id = rdma_id; + + ret = nvdla_read_config(engine, engine->task, processor, group); + if (ret) + goto exit; + + group->pending = 1; + + processor->group_status |= (1 << group->id); + + processor->rdma_check(group); + if (group->is_rdma_needed) { + group->rdma_id = rdma_id; + processor->rdma_status |= (1 << rdma_id); + } + + processor->tail_op = op_desc; +exit: + return ret; +} + +static int +nvdla_program_operation(struct nvdla_engine *engine, + struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + int32_t i; + int32_t ret = 0; + struct nvdla_common_op_desc *op_desc; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + drm_info(nvdla_dev->drm, "Program %s operation index %d ROI %d Group[%d]\n", + processor->name, group->op_desc->index, group->roi_index, group->id); + + group->programming = 1; + + op_desc = group->op_desc; + + processor->set_producer(engine, group->id, group->rdma_id); + + ret = processor->program(engine, group); + if (ret) + goto exit; + + /* Pre-fetch consumers */ + for (i = 0; i < NVDLA_OP_NUM; i++) { + group->consumers[i] = nvdla_get_op_desc(engine, engine->task, + op_desc->consumers[i].index, i, + group->roi_index); + } + + group->fused_parent = nvdla_get_op_desc(engine, engine->task, + op_desc->fused_parent.index, + op_desc->op_type - 1, + group->roi_index); + + if (group->fused_parent != NULL) { + if 
(group->fused_parent->op_type != (op_desc->op_type - 1)) { + ret = -EINVAL; + goto exit; + } + } + + ret = nvdla_op_programmed(engine, processor, group, group->rdma_id); + if (!ret) + goto exit; + +exit: + group->programming = 0; + return ret; +} + +static int +nvdla_enable_operation(struct nvdla_engine *engine, + struct nvdla_processor *processor, + struct nvdla_common_op_desc *op_desc) +{ + int32_t ret = 0; + int32_t group_id; + struct nvdla_processor_group *group; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + /** + * If some operation has reported error then skip + * enabling next operations + */ + if (engine->status) + goto exit; + + /** + * Find out if operation is already programmed + */ + group_id = 0; + group = &processor->groups[group_id]; + if ((processor->group_status & (1 << group_id)) && + group->op_desc->index == op_desc->index && + group->roi_index == op_desc->roi_index && + !group->pending) + goto enable_op; + + group_id = 1; + group = &processor->groups[group_id]; + if ((processor->group_status & (1 << group_id)) && + group->op_desc->index == op_desc->index && + group->roi_index == op_desc->roi_index && + !group->pending) + goto enable_op; + + /** + * Operation is not programmed yet, ignore + */ + drm_dbg(nvdla_dev->drm, + "exit without actual enable due to processor hasn't been programmed\n"); + goto exit; + +enable_op: + /** + * If this event is triggered as part of programming same + * group then skip enable, it will get enabled after programming + * is complete + */ + if (group->programming) + goto exit; + + if (group->active) { + drm_dbg(nvdla_dev->drm, + "Processor:%s already enabled on group:%d\n", + processor->name, group_id); + goto exit; + } + + drm_info(nvdla_dev->drm, "Enable %s operation index %d ROI %d\n", + processor->name, group->op_desc->index, group->roi_index); + + processor->set_producer(engine, group->id, group->rdma_id); + + ret = processor->enable(engine, group); + if (ret) + goto exit; + + ret = nvdla_op_enabled(engine, group); +exit: + return ret; +} + +static int +nvdla_submit_operation(struct nvdla_engine *engine, + struct nvdla_processor *processor, + struct nvdla_common_op_desc *op_desc, + uint8_t roi_index) +{ + int32_t err; + uint32_t group_id = 0; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + drm_info(nvdla_dev->drm, "Prepare %s operation index %d ROI %d dep_count %d\n", + processor->name, op_desc->index, roi_index, + op_desc->dependency_count); + err = nvdla_prepare_operation(engine, processor, op_desc, roi_index, &group_id); + if (err) + goto exit; + + if (!processor->is_ready(processor, &processor->groups[group_id])) + goto exit; + + err = nvdla_program_operation(engine, processor, &processor->groups[group_id]); + if (err) + goto exit; + + if (op_desc->dependency_count == 0) + err = nvdla_enable_operation(engine, processor, op_desc); + +exit: + return err; +} + +/** + * Dequeue next operation of same type from list of operations + */ +static int32_t +nvdla_dequeue_operation(struct nvdla_engine *engine, + struct nvdla_processor *processor) +{ + int32_t ret = 0; + int16_t index; + struct nvdla_common_op_desc *consumer; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (engine->status) { + drm_dbg(nvdla_dev->drm, "Skip dequeue op as engine has reported error\n"); + goto exit; + } + + /** + * If we are done processing all ROIs for current op then + * load next op of same type otherwise reload same op for + * next ROI. 
+ */ + if (processor->roi_index == (engine->network->num_rois - 1)) { + index = processor->tail_op->consumers[processor->op_type].index; + if (-1 == index) { + /** + * It means we are done processing + * all ops of this type + */ + drm_dbg(nvdla_dev->drm, "exit %s as there's no further operation\n", + processor->name); + goto exit; + } + processor->roi_index = 0; + } else { + processor->roi_index++; + index = processor->tail_op->index; + } + + drm_dbg(nvdla_dev->drm, "Dequeue op from %s processor, index=%d ROI=%d\n", + processor->name, index, processor->roi_index); + + /** + * Get operation descriptor + */ + consumer = nvdla_get_op_desc(engine, engine->task, index, + processor->op_type, processor->roi_index); + if (consumer == NULL) { + ret = -ENOMEM; + drm_err(nvdla_dev->drm, "Failed to allocate op_desc"); + goto exit; + } + + ret = nvdla_submit_operation(engine, processor, consumer, processor->roi_index); + nvdla_put_op_desc(engine, consumer); + +exit: + return ret; +} + +static int +nvdla_update_dependency(struct nvdla_engine *engine, + struct nvdla_consumer *consumer, + struct nvdla_common_op_desc *op_desc, + uint8_t event, uint8_t roi_index) +{ + int32_t ret = 0; + struct nvdla_processor *processor; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (consumer->index == -1) + goto exit; + + /* Update dependency only if event matches */ + if (event != consumer->event) + goto exit; + + /** + * If consumer index is valid but op desc is NULL means + * op desc for consumer was not pre-fetched + */ + if (op_desc == NULL) { + ret = -EINVAL; + drm_err(nvdla_dev->drm, "Operation descriptor is NULL, consumer index %d", + consumer->index); + goto exit; + } + + drm_dbg(nvdla_dev->drm, "Update dependency operation index %d ROI %d DEP_COUNT=%d\n", + op_desc->index, op_desc->roi_index, op_desc->dependency_count); + op_desc->dependency_count--; + + if (op_desc->dependency_count == 0) { + processor = &engine->processors[op_desc->op_type]; + drm_dbg(nvdla_dev->drm, "enable %s as dependency are resolved\n", + processor->name); + + ret = nvdla_enable_operation(engine, processor, op_desc); + if (ret) + goto exit; + } +exit: + return ret; +} + +static int +nvdla_update_consumers(struct nvdla_engine *engine, + struct nvdla_processor_group *group, + struct nvdla_common_op_desc *op, + uint8_t event) +{ + int32_t i; + int32_t ret = 0; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + if (engine->status) { + drm_dbg(nvdla_dev->drm, "Skip update as engine has reported error\n"); + goto exit; + } + + for (i = 0; i < NVDLA_OP_NUM; i++) { + ret = nvdla_update_dependency(engine, &op->consumers[i], + group->consumers[i], + event, group->roi_index); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to update dependency for consumer %d, ROI %d", + i, group->roi_index); + goto exit; + } + } + + ret = nvdla_update_dependency(engine, &op->fused_parent, + group->fused_parent, + event, group->roi_index); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to update dependency for " + "fused parent, ROI %d", group->roi_index); + goto exit; + } + +exit: + return ret; +} + +/** + * Handle operation completion notification + */ +int +nvdla_op_completion(struct nvdla_engine *engine, + struct nvdla_processor *processor, + struct nvdla_processor_group *group) +{ + int32_t ret; + struct nvdla_task_desc *task; + struct nvdla_common_op_desc *op_desc; + struct nvdla_processor_group *next_group; + struct nvdla_device *nvdla_dev = + (struct nvdla_device 
*)engine->driver_context; + + drm_info(nvdla_dev->drm, "Completed %s operation index %d ROI %d\n", + processor->name, group->op_desc->index, group->roi_index); + + task = engine->task; + + /** + * Mark OP as done only when all ROIs are done for that + * operation + */ + if (group->roi_index == (engine->network->num_rois - 1)) + engine->num_proc_hwl++; + + op_desc = group->op_desc; + + /** + * Get an extra reference count to keep op descriptor + * in cache until this operation completes + */ + nvdla_get_refcount(op_desc); + + processor->group_status &= ~(1 << group->id); + if (group->is_rdma_needed) { + group->is_rdma_needed = 0; + processor->rdma_status &= ~(1 << group->rdma_id); + group->rdma_id = 0; + } + group->active = 0; + group->lut_index = -1; + processor->last_group = group->id; + + /** + * Switch consumer pointer to next group + */ + processor->consumer_ptr = !group->id; + + /** + * update dependency graph for this task + * TODO: Add proper error handling + */ + ret = nvdla_update_consumers(engine, group, op_desc, NVDLA_EVENT_OP_COMPLETED); + if (ret) + goto exit; + + drm_info(nvdla_dev->drm, "%d HWLs done, totally %d layers\n", + engine->num_proc_hwl, engine->network->num_operations); + + /* free operation descriptor from cache */ + nvdla_reset_group(engine, group); + + /* if not hwl pending, means network completed */ + if (engine->network->num_operations == engine->num_proc_hwl) { + nvdla_put_op_desc(engine, op_desc); + goto exit; + } + + next_group = &processor->groups[!group->id]; + if (next_group->pending && !engine->status) { + /** + * Next group must be ready here for programming, + * if not means it is an error + */ + if (!processor->is_ready(processor, next_group)) + goto dequeue_op; + + ret = nvdla_program_operation(engine, processor, next_group); + if (ret) + goto exit; + + if (next_group->op_desc->dependency_count != 0) + goto dequeue_op; + + ret = nvdla_enable_operation(engine, processor, + next_group->op_desc); + if (ret) + goto exit; + } + +dequeue_op: + /* dequeue operation from this processor */ + ret = nvdla_dequeue_operation(engine, processor); + +exit: + nvdla_put_op_desc(engine, op_desc); + drm_dbg(nvdla_dev->drm, "Exit:%s processor %s group%u status=%d\n", + __func__, processor->name, group->id, ret); + + return ret; +} + +/** + * Read network configuration from DRAM, network descriptor address + * is always first in the address list. Network configuration contains + * offset in address list for addresses of other lists used to + * execute network + * + * @engine: Engine instance + * @return: 0 for success + */ +static int +nvdla_read_network_config(struct nvdla_engine *engine) +{ + int32_t ret; + uint64_t network_addr; + struct nvdla_task_desc *task = engine->task; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + /** + * Read address list from DRAM to DMEM + */ + ret = nvdla_read_address_list(engine); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to read address list"); + goto exit; + } + + /** + * Read network descriptor address from address list. It is always + * at index 0. + */ + ret = nvdla_get_dma_address(engine->driver_context, task->task_data, + 0, (void *)&network_addr, + DESTINATION_PROCESSOR); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to read network desc address"); + goto exit; + } + + /** + * Read network descriptor, it has information for a network + * such as all address indexes. 
+	 */
+	ret = nvdla_data_read(engine->driver_context, task->task_data,
+			      network_addr, (void *)&network,
+			      sizeof(struct nvdla_network_desc), 0);
+	if (ret) {
+		drm_err(nvdla_dev->drm, "Failed to read network descriptor\n");
+		goto exit;
+	}
+
+	if (network.num_operations == 0)
+		goto exit;
+
+	/**
+	 * Read operation descriptor list address from address list
+	 */
+	ret = nvdla_get_dma_address(engine->driver_context, task->task_data,
+				    network.operation_desc_index,
+				    (void *)&task->operation_desc_addr,
+				    DESTINATION_PROCESSOR);
+	if (ret) {
+		drm_err(nvdla_dev->drm, "Failed to read operation desc list address\n");
+		goto exit;
+	}
+
+	/**
+	 * Read surface descriptor list address from address list
+	 */
+	ret = nvdla_get_dma_address(engine->driver_context, task->task_data,
+				    network.surface_desc_index,
+				    (void *)&task->surface_desc_addr,
+				    DESTINATION_PROCESSOR);
+	if (ret) {
+		drm_err(nvdla_dev->drm, "Failed to read surface desc list address\n");
+		goto exit;
+	}
+
+	/**
+	 * Read dependency graph address from address list
+	 */
+	ret = nvdla_get_dma_address(engine->driver_context, task->task_data,
+				    network.dependency_graph_index,
+				    (void *)&task->dependency_graph_addr,
+				    DESTINATION_PROCESSOR);
+	if (ret) {
+		drm_err(nvdla_dev->drm, "Failed to read dependency graph address\n");
+		goto exit;
+	}
+
+	/**
+	 * Read LUT data list address from address list
+	 */
+	if (network.num_luts) {
+		ret = nvdla_get_dma_address(engine->driver_context,
+					    task->task_data,
+					    network.lut_data_index,
+					    (void *)&task->lut_data_addr,
+					    DESTINATION_PROCESSOR);
+		if (ret) {
+			drm_err(nvdla_dev->drm, "Failed to read LUT list address\n");
+			goto exit;
+		}
+	}
+
+	/**
+	 * Read address for ROI information
+	 */
+	if (network.dynamic_roi) {
+		/**
+		 * Read ROI array address from address list
+		 */
+		ret = nvdla_get_dma_address(engine->driver_context,
+					    task->task_data,
+					    network.roi_array_index,
+					    (void *)&task->roi_array_addr,
+					    DESTINATION_PROCESSOR);
+		if (ret) {
+			drm_err(nvdla_dev->drm, "Failed to read ROI array address\n");
+			goto exit;
+		}
+
+		ret = nvdla_data_read(engine->driver_context, task->task_data,
+				      task->roi_array_addr,
+				      (void *)&roi_array_length,
+				      sizeof(uint64_t), 0);
+		if (ret) {
+			drm_err(nvdla_dev->drm, "Failed to read ROI array length\n");
+			goto exit;
+		}
+
+		/**
+		 * Number of ROIs detected can't be greater than the maximum
+		 * number of ROIs this network can process
+		 */
+		if (roi_array_length > network.num_rois) {
+			drm_err(nvdla_dev->drm, "Invalid number of ROIs detected\n");
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		network.num_rois = roi_array_length;
+
+		/**
+		 * Read surface address from address list
+		 */
+		ret = nvdla_get_dma_address(engine->driver_context,
+					    task->task_data,
+					    network.surface_index,
+					    (void *)&task->surface_addr,
+					    DESTINATION_DMA);
+		if (ret) {
+			drm_err(nvdla_dev->drm, "Failed to read surface address");
+			goto exit;
+		}
+	}
+
+exit:
+	return ret;
+}
+
+static int
+nvdla_initiate_processors(struct nvdla_engine *engine)
+{
+	int32_t i;
+	int32_t ret = 0;
+	int16_t index;
+	struct nvdla_processor *processor;
+	struct nvdla_common_op_desc *consumer;
+	struct nvdla_network_desc *nw;
+	struct nvdla_device *nvdla_dev =
+		(struct nvdla_device *)engine->driver_context;
+
+	nw = engine->network;
+
+	/* Validate operation heads before initiating processors */
+	for (i = 0; i < NVDLA_OP_NUM; i++) {
+		if (nw->op_head[i] >= nw->num_operations) {
+			ret = -EINVAL;
+			drm_err(nvdla_dev->drm, "Invalid op_head %d for op %d",
+				nw->op_head[i], i);
+			goto exit;
+		}
+	}
+
+	for (i = 0; i < NVDLA_OP_NUM; i++)
{ + index = nw->op_head[i]; + + /* If there is no op for this type then continue */ + if (-1 == index) + continue; + + consumer = nvdla_get_op_desc(engine, engine->task, index, i, 0); + /* + * if consumer is NULL, it means either data copy error + * or cache insufficient - we should fix it + **/ + if (consumer == NULL) { + drm_err(nvdla_dev->drm, + "Failed to allocate memory for op_head[%d]=%d", + i, index); + ret = -ENOMEM; + goto exit; + } + + processor = &engine->processors[consumer->op_type]; + + ret = nvdla_submit_operation(engine, processor, consumer, 0); + nvdla_put_op_desc(engine, consumer); + if (ret && ret != -EBUSY) { + drm_err(nvdla_dev->drm, "Failed to submit %s op from index %u\n", + processor->name, index); + goto exit; + } + + ret = nvdla_dequeue_operation(engine, processor); + if (ret) { + drm_err(nvdla_dev->drm, "Failed to dequeue op for %s processor", + processor->name); + goto exit; + } + } +exit: + return ret; +} + +static int +nvdla_handle_events(struct nvdla_engine *engine, struct nvdla_processor *processor) +{ + int32_t j; + int32_t ret = 0; + uint8_t group_id; + struct nvdla_processor_group *group; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + group_id = !processor->last_group; + + for (j = 0; j < NVDLA_NUM_GROUPS; j++) { + group = &processor->groups[group_id]; + + if ((1 << NVDLA_EVENT_CDMA_WT_DONE) & group->events) { + drm_info(nvdla_dev->drm, + "Handle cdma weight done event, processor %s group %u\n", + processor->name, group->id); + + ret = nvdla_update_consumers(engine, group, group->op_desc, + NVDLA_EVENT_CDMA_WT_DONE); + if (ret) + goto exit; + } + + if ((1 << NVDLA_EVENT_CDMA_DT_DONE) & group->events) { + drm_info(nvdla_dev->drm, + "Handle cdma data done event, processor %s group %u\n", + processor->name, group->id); + + ret = nvdla_update_consumers(engine, group, + group->op_desc, + NVDLA_EVENT_CDMA_DT_DONE); + if (ret) + goto exit; + } + + /** + * Handle complete after all other events + */ + if ((1 << NVDLA_EVENT_OP_COMPLETED) & group->events) { + drm_info(nvdla_dev->drm, + "Handle op complete event, processor %s group %u\n", + processor->name, group->id); + + ret = nvdla_op_completion(engine, processor, group); + if (ret) + goto exit; + } + + /** + * Clear all events + */ + group->events = 0; + group_id = !group_id; + } +exit: + return ret; +} + +int +nvdla_process_events(struct nvdla_engine *engine, uint32_t *task_complete) +{ + int32_t i; + int32_t ret = 0; + + for (i = 0; i < NVDLA_OP_NUM; i++) { + struct nvdla_processor *processor; + + processor = &engine->processors[i]; + ret = nvdla_handle_events(engine, processor); + /** + * Incase engine status is non-zero, then don't + * update the engine status. We should keep its + * status for later cleaning of engine. + */ + if (!engine->status) + engine->status = ret; + } + + if (engine->network->num_operations == engine->num_proc_hwl) + *task_complete = 1; + + return ret; +} + +/** + * Execute task selected by task scheduler + * + * 1. Read network configuration for the task + * 2. Initiate processors with head of list for same op + * 3. 
Start processing events received + */ +int +nvdla_execute_task(struct nvdla_engine *engine, void *task_data, void *config_data) +{ + int32_t ret; + + if (engine == NULL) { + ret = -EINVAL; + goto complete; + } + + if (engine->task == NULL) { + ret = -EINVAL; + goto complete; + } + + if (engine->task->task_data != NULL) { + /* We have on the fly tasks running */ + ret = -EBUSY; + goto complete; + } + + engine->task->task_data = task_data; + engine->config_data = config_data; + engine->network = &network; + engine->num_proc_hwl = 0; + + ret = nvdla_read_network_config(engine); + if (ret) + goto complete; + + /* If no operations in a task means nothing to do, NULL task */ + if (engine->network->num_operations == 0) + goto complete; + + ret = nvdla_initiate_processors(engine); + engine->status = ret; + +complete: + return ret; +} + +void +nvdla_clear_task(struct nvdla_engine *engine) +{ + int32_t i, j; + struct nvdla_device *nvdla_dev = + (struct nvdla_device *)engine->driver_context; + + for (i = 0; i < NVDLA_OP_NUM; i++) { + struct nvdla_processor *processor = &engine->processors[i]; + + processor->roi_index = 0; + processor->group_status = 0; + processor->rdma_status = 0; + + processor->tail_op = NULL; + + for (j = 0; j < NVDLA_NUM_GROUPS; j++) { + struct nvdla_processor_group *group = + &processor->groups[j]; + + group->rdma_id = group->id; + group->active = 0; + group->events = 0; + group->roi_index = 0; + group->is_rdma_needed = 0; + group->lut_index = -1; + } + } + + engine->task->task_data = NULL; + engine->network = NULL; + engine->num_proc_hwl = 0; + engine->status = 0; + + drm_info(nvdla_dev->drm, "reset engine done\n"); +} diff --git a/drivers/gpu/drm/nvdla/nvdla_sdp.c b/drivers/gpu/drm/nvdla/nvdla_sdp.c new file mode 100644 index 000000000000..a7cd67d2e977 --- /dev/null +++ b/drivers/gpu/drm/nvdla/nvdla_sdp.c @@ -0,0 +1,723 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Copyright (C) 2017-2018 NVIDIA CORPORATION + * Copyright (C) 2022 Cai Huoqing + */ + +#include "nvdla_drv.h" +#include "nvdla_reg.h" +#include "nvdla_common.h" +#include "nvdla_engine.h" + +static const uint8_t map_ena[] = { + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, YES), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE, NO), +}; + +static const uint8_t map_prelu[] = { + FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, NO), + FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_MUL_PRELU, YES), +}; + +static const uint8_t map_bypass[] = { + FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, YES), + FIELD_ENUM(SDP_D_DP_BS_CFG_0, BS_BYPASS, NO), +}; + +static const uint8_t map_alu_op[] = { + FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MAX), + FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, MIN), + FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, SUM), + FIELD_ENUM(SDP_D_DP_EW_CFG_0, EW_ALU_ALGO, EQL), +}; + +static const uint8_t map_alu_src[] = { + FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, MEM), + FIELD_ENUM(SDP_D_DP_BS_ALU_CFG_0, BS_ALU_SRC, REG), +}; + +static const uint8_t map_fly[] = { + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, OFF), + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, FLYING_MODE, ON), +}; + +static const uint8_t map_dst[] = { + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, MEM), + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, OUTPUT_DST, PDP), +}; + + +static const uint8_t map_wg[] = { + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, OFF), + FIELD_ENUM(SDP_D_FEATURE_MODE_CFG_0, WINOGRAD, ON), +}; + +static const uint8_t map_precision[] = { + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8), + 
FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16), +}; + +static const uint32_t map_proc_precision[3][3] = { + { + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16), + }, + { + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16), + }, + { + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT8), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, INT16), + FIELD_ENUM(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION, FP16), + }, +}; + +static const uint8_t map_op_type[] = { + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, MUL), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, ALU), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_USE, BOTH), +}; + +static const uint8_t map_element_size[] = { + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, ONE_BYTE), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_SIZE, TWO_BYTE), +}; + +static const uint8_t map_op_mode[] = { + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_KERNEL), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DATA_MODE, PER_ELEMENT), +}; + +static const uint8_t map_ram_type[] = { + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, MC), + FIELD_ENUM(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_RAM_TYPE, CV), +}; + +static const uint8_t map_perf_dma[] = { + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, NO), + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_DMA_EN, YES), +}; + +static const uint8_t map_perf_lut[] = { + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, NO), + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_LUT_EN, YES), +}; + +static const uint8_t map_perf_sat[] = { + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, NO), + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_SAT_EN, YES), +}; + +static const uint8_t map_perf_nan_inf[] = { + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, NO), + FIELD_ENUM(SDP_D_PERF_ENABLE_0, PERF_NAN_INF_COUNT_EN, YES), +}; + +void +nvdla_sdp_set_producer(struct nvdla_engine *engine, int32_t group_id, int32_t rdma_group_id) +{ + uint32_t reg; + + reg = group_id << SHIFT(SDP_S_POINTER_0, PRODUCER); + sdp_reg_write(engine, S_POINTER, reg); + reg = rdma_group_id << SHIFT(SDP_RDMA_S_POINTER_0, PRODUCER); + sdp_rdma_reg_write(engine, S_POINTER, reg); +} + +int +nvdla_sdp_enable(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + uint32_t reg; + + /* enable all sub-modules */ + if (group->is_rdma_needed) { + reg = FIELD_ENUM(SDP_RDMA_D_OP_ENABLE_0, OP_EN, ENABLE); + sdp_rdma_reg_write(engine, D_OP_ENABLE, reg); + } + reg = FIELD_ENUM(SDP_D_OP_ENABLE_0, OP_EN, ENABLE); + sdp_reg_write(engine, D_OP_ENABLE, reg); + + return 0; +} + +void +nvdla_sdp_rdma_check(struct nvdla_processor_group *group) +{ + uint8_t x1_rdma_ena; + uint8_t x2_rdma_ena; + uint8_t y_rdma_ena; + uint8_t fly; + struct nvdla_sdp_op_desc *sdp_op; + struct nvdla_sdp_surface_desc *sdp_surface; + + sdp_op = &group->operation_desc->sdp_op; + sdp_surface = &group->surface_desc->sdp_surface; + + x1_rdma_ena = sdp_op->x1_op.enable; + x2_rdma_ena = sdp_op->x2_op.enable; + y_rdma_ena = sdp_op->y_op.enable; + 
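+	/*
+	 * Operands configured in SDP_OP_PER_LAYER mode are programmed through
+	 * registers rather than fetched by RDMA, so they are masked out below.
+	 * RDMA is then needed only when the input cube comes from memory
+	 * (src type != NVDLA_MEM_HW, i.e. not the on-the-fly path) or when at
+	 * least one enabled operand still has to be fetched per element or
+	 * per kernel.
+	 */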
+ x1_rdma_ena &= (sdp_op->x1_op.mode != SDP_OP_PER_LAYER); + x2_rdma_ena &= (sdp_op->x2_op.mode != SDP_OP_PER_LAYER); + y_rdma_ena &= (sdp_op->y_op.mode != SDP_OP_PER_LAYER); + + fly = sdp_surface->src_data.type == NVDLA_MEM_HW; + + group->is_rdma_needed = (!fly) || (x1_rdma_ena || + x2_rdma_ena || y_rdma_ena); +} + +static int32_t +processor_sdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group) +{ + int32_t ret = 0; + uint64_t src_addr = -1, x1_addr = -1, x2_addr = -1; + uint64_t y_addr = -1, dst_addr = -1; + uint32_t reg, high, low; + uint8_t fly; + uint32_t atom_size; + struct nvdla_sdp_op *x1_op; + struct nvdla_sdp_op *x2_op; + struct nvdla_sdp_op *y_op; + uint8_t x1_rdma_ena; + uint8_t x2_rdma_ena; + uint8_t y_rdma_ena; + uint8_t out_dma_ena; + struct nvdla_lut_param lut; + struct nvdla_sdp_op_desc *sdp_op; + struct nvdla_sdp_surface_desc *sdp_surface; + + atom_size = engine->config_data->atom_size; + + sdp_op = &group->operation_desc->sdp_op; + sdp_surface = &group->surface_desc->sdp_surface; + + fly = sdp_surface->src_data.type == NVDLA_MEM_HW; + out_dma_ena = sdp_surface->dst_data.type != NVDLA_MEM_HW; + x1_op = &sdp_op->x1_op; + x2_op = &sdp_op->x2_op; + y_op = &sdp_op->y_op; + x1_rdma_ena = x1_op->enable && x1_op->type != SDP_OP_NONE; + x2_rdma_ena = x2_op->enable && x2_op->type != SDP_OP_NONE; + y_rdma_ena = y_op->enable && y_op->type != SDP_OP_NONE; + + /* load address */ + if (!fly) { + ret = nvdla_read_input_address(engine, &sdp_surface->src_data, + &src_addr, + group->op_desc->index, + group->roi_index, + 1); + if (ret) + goto exit; + } + + if (out_dma_ena) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + sdp_surface->dst_data.address, + sdp_surface->dst_data.offset, + (void *)&dst_addr, + DESTINATION_DMA); + } + + if (sdp_op->lut_index >= 0) { + group->lut_index = sdp_op->lut_index; + nvdla_read_lut(engine, sdp_op->lut_index, (void *)&lut); + } + + + x1_rdma_ena &= (x1_op->mode != SDP_OP_PER_LAYER); + x2_rdma_ena &= (x2_op->mode != SDP_OP_PER_LAYER); + y_rdma_ena &= (y_op->mode != SDP_OP_PER_LAYER); + + if (x1_rdma_ena) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + sdp_surface->x1_data.address, + sdp_surface->x1_data.offset, + (void *)&x1_addr, + DESTINATION_DMA); + } + if (x2_rdma_ena) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + sdp_surface->x2_data.address, + sdp_surface->x2_data.offset, + (void *)&x2_addr, + DESTINATION_DMA); + } + if (y_rdma_ena) { + nvdla_get_dma_cube_address(engine->driver_context, + engine->task->task_data, + sdp_surface->y_data.address, + sdp_surface->y_data.offset, + (void *)&y_addr, + DESTINATION_DMA); + } + + reg = (map_fly[0] << SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE)); + sdp_rdma_reg_write(engine, D_FEATURE_MODE_CFG, reg); + + reg = (map_ena[1] << SHIFT(SDP_RDMA_D_BRDMA_CFG_0, BRDMA_DISABLE)); + sdp_rdma_reg_write(engine, D_BRDMA_CFG, reg); + reg = (map_ena[1] << SHIFT(SDP_RDMA_D_NRDMA_CFG_0, NRDMA_DISABLE)); + sdp_rdma_reg_write(engine, D_NRDMA_CFG, reg); + reg = (map_ena[1] << SHIFT(SDP_RDMA_D_ERDMA_CFG_0, ERDMA_DISABLE)); + sdp_rdma_reg_write(engine, D_ERDMA_CFG, reg); + + reg = (map_fly[fly] << + SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, FLYING_MODE)) | + (map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] << + SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, WINOGRAD)) | + (map_precision[sdp_op->src_precision] << + SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, IN_PRECISION)) | + (map_precision[sdp_op->dst_precision] << + 
SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, OUT_PRECISION)) | + (map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] << + SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, PROC_PRECISION)) | + ((sdp_op->batch_num-1) << + SHIFT(SDP_RDMA_D_FEATURE_MODE_CFG_0, BATCH_NUMBER)); + sdp_rdma_reg_write(engine, D_FEATURE_MODE_CFG, reg); + + if (group->is_rdma_needed) { + + sdp_rdma_reg_write(engine, D_DATA_CUBE_WIDTH, + sdp_surface->src_data.width - 1); + sdp_rdma_reg_write(engine, D_DATA_CUBE_HEIGHT, + sdp_surface->src_data.height - 1); + sdp_rdma_reg_write(engine, D_DATA_CUBE_CHANNEL, + sdp_surface->src_data.channel - 1); + + /* config SDP source info */ + if (!fly) { + /** + * if not on-the-fly, we have to config + * the source cube info + */ + high = upper_32_bits(src_addr); + low = lower_32_bits(src_addr); + sdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_LOW, low); + sdp_rdma_reg_write(engine, D_SRC_BASE_ADDR_HIGH, high); + sdp_rdma_reg_write(engine, D_SRC_LINE_STRIDE, + sdp_surface->src_data.line_stride); + sdp_rdma_reg_write(engine, D_SRC_SURFACE_STRIDE, + sdp_surface->src_data.surf_stride); + sdp_rdma_reg_write(engine, D_SRC_DMA_CFG, + map_ram_type[sdp_surface->src_data.type]); + } + + /* config x1 source info */ + reg = (map_ena[x1_rdma_ena] << + SHIFT(SDP_RDMA_D_BRDMA_CFG_0, + BRDMA_DISABLE)) | + (map_op_type[x1_op->type] << + SHIFT(SDP_RDMA_D_BRDMA_CFG_0, + BRDMA_DATA_USE)) | + (map_element_size[x1_op->precision] << + SHIFT(SDP_RDMA_D_BRDMA_CFG_0, + BRDMA_DATA_SIZE)) | + (map_op_mode[x1_op->mode] << + SHIFT(SDP_RDMA_D_BRDMA_CFG_0, + BRDMA_DATA_MODE)) | + (map_ram_type[sdp_surface->x1_data.type] << + SHIFT(SDP_RDMA_D_BRDMA_CFG_0, + BRDMA_RAM_TYPE)); + sdp_rdma_reg_write(engine, D_BRDMA_CFG, reg); + + if (x1_rdma_ena) { + high = upper_32_bits(x1_addr); + low = lower_32_bits(x1_addr); + sdp_rdma_reg_write(engine, D_BS_BASE_ADDR_LOW, + low); + sdp_rdma_reg_write(engine, D_BS_BASE_ADDR_HIGH, + high); + sdp_rdma_reg_write(engine, D_BS_LINE_STRIDE, + sdp_surface->x1_data.line_stride); + sdp_rdma_reg_write(engine, D_BS_SURFACE_STRIDE, + sdp_surface->x1_data.surf_stride); + } + + /* config x2 source info */ + reg = (map_ena[x2_rdma_ena] << + SHIFT(SDP_RDMA_D_NRDMA_CFG_0, + NRDMA_DISABLE)) | + (map_op_type[x2_op->type] << + SHIFT(SDP_RDMA_D_NRDMA_CFG_0, + NRDMA_DATA_USE)) | + (map_element_size[x2_op->precision] << + SHIFT(SDP_RDMA_D_NRDMA_CFG_0, + NRDMA_DATA_SIZE)) | + (map_op_mode[x2_op->mode] << + SHIFT(SDP_RDMA_D_NRDMA_CFG_0, + NRDMA_DATA_MODE)) | + (map_ram_type[sdp_surface->x2_data.type] << + SHIFT(SDP_RDMA_D_NRDMA_CFG_0, + NRDMA_RAM_TYPE)); + + sdp_rdma_reg_write(engine, D_NRDMA_CFG, reg); + + if (x2_rdma_ena) { + high = upper_32_bits(x2_addr); + low = lower_32_bits(x2_addr); + sdp_rdma_reg_write(engine, D_BN_BASE_ADDR_LOW, + low); + sdp_rdma_reg_write(engine, D_BN_BASE_ADDR_HIGH, + high); + sdp_rdma_reg_write(engine, D_BN_LINE_STRIDE, + sdp_surface->x2_data.line_stride); + sdp_rdma_reg_write(engine, D_BN_SURFACE_STRIDE, + sdp_surface->x2_data.surf_stride); + } + + /* config y source info */ + reg = (map_ena[y_rdma_ena] << + SHIFT(SDP_RDMA_D_ERDMA_CFG_0, + ERDMA_DISABLE)) | + (map_op_type[y_op->type] << + SHIFT(SDP_RDMA_D_ERDMA_CFG_0, + ERDMA_DATA_USE)) | + (map_element_size[y_op->precision] << + SHIFT(SDP_RDMA_D_ERDMA_CFG_0, + ERDMA_DATA_SIZE)) | + (map_op_mode[y_op->mode] << + SHIFT(SDP_RDMA_D_ERDMA_CFG_0, + ERDMA_DATA_MODE)) | + (map_ram_type[sdp_surface->y_data.type] << + SHIFT(SDP_RDMA_D_ERDMA_CFG_0, + ERDMA_RAM_TYPE)); + + sdp_rdma_reg_write(engine, D_ERDMA_CFG, reg); + if (y_rdma_ena) { + high = 
upper_32_bits(y_addr); + low = lower_32_bits(y_addr); + sdp_rdma_reg_write(engine, D_EW_BASE_ADDR_LOW, + low); + sdp_rdma_reg_write(engine, D_EW_BASE_ADDR_HIGH, + high); + sdp_rdma_reg_write(engine, D_EW_LINE_STRIDE, + sdp_surface->y_data.line_stride); + sdp_rdma_reg_write(engine, D_EW_SURFACE_STRIDE, + sdp_surface->y_data.surf_stride); + } + } + + if (sdp_op->lut_index >= 0) + update_lut(engine, SDP_S_LUT_ACCESS_CFG_0, &lut, + sdp_op->src_precision); + + sdp_reg_write(engine, D_DATA_CUBE_WIDTH, sdp_surface->src_data.width - 1); + sdp_reg_write(engine, D_DATA_CUBE_HEIGHT, sdp_surface->src_data.height - 1); + sdp_reg_write(engine, D_DATA_CUBE_CHANNEL, sdp_surface->src_data.channel - 1); + + if (out_dma_ena) { + high = upper_32_bits(dst_addr); + low = lower_32_bits(dst_addr); + sdp_reg_write(engine, D_DST_BASE_ADDR_HIGH, + high); + sdp_reg_write(engine, D_DST_BASE_ADDR_LOW, + low); + sdp_reg_write(engine, D_DST_LINE_STRIDE, + sdp_surface->dst_data.line_stride); + sdp_reg_write(engine, D_DST_SURFACE_STRIDE, + sdp_surface->dst_data.surf_stride); + } + + /* Config BS module */ + reg = (map_bypass[x1_op->enable] << + SHIFT(SDP_D_DP_BS_CFG_0, + BS_BYPASS)) | + (map_bypass[x1_op->type != SDP_OP_MUL && + x1_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_BS_CFG_0, + BS_ALU_BYPASS)) | + (map_alu_op[x1_op->alu_type] << + SHIFT(SDP_D_DP_BS_CFG_0, + BS_ALU_ALGO)) | + (map_bypass[x1_op->type != SDP_OP_ADD && + x1_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_BS_CFG_0, + BS_MUL_BYPASS)) | + (map_prelu[x1_op->act == ACTIVATION_PRELU] + << SHIFT(SDP_D_DP_BS_CFG_0, + BS_MUL_PRELU)) | + (map_bypass[x1_op->act == ACTIVATION_RELU] << + SHIFT(SDP_D_DP_BS_CFG_0, + BS_RELU_BYPASS)); + sdp_reg_write(engine, D_DP_BS_CFG, reg); + + if (x1_op->enable) { + if (x1_op->type == SDP_OP_ADD || + x1_op->type == SDP_OP_BOTH) { + reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_BS_ALU_CFG_0, + BS_ALU_SRC)) | + (x1_op->shift_value << + SHIFT(SDP_D_DP_BS_ALU_CFG_0, + BS_ALU_SHIFT_VALUE)); + sdp_reg_write(engine, D_DP_BS_ALU_CFG, reg); + } + + if (x1_op->mode == SDP_OP_PER_LAYER) { + sdp_reg_write(engine, D_DP_BS_ALU_SRC_VALUE, + x1_op->alu_operand); + sdp_reg_write(engine, D_DP_BS_MUL_SRC_VALUE, + x1_op->mul_operand); + } + + /** + * MUL truncate will take effect no matter + * MUL is bypassed or not + */ + reg = (map_alu_src[x1_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_BS_MUL_CFG_0, + BS_MUL_SRC)) | + (x1_op->truncate << + SHIFT(SDP_D_DP_BS_MUL_CFG_0, + BS_MUL_SHIFT_VALUE)); + sdp_reg_write(engine, D_DP_BS_MUL_CFG, reg); + } + + /* Config BN module */ + reg = (map_bypass[x2_op->enable] << + SHIFT(SDP_D_DP_BN_CFG_0, + BN_BYPASS)) | + (map_bypass[x2_op->type != SDP_OP_MUL && + x2_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_BN_CFG_0, + BN_ALU_BYPASS)) | + (map_alu_op[x2_op->alu_type] << + SHIFT(SDP_D_DP_BN_CFG_0, + BN_ALU_ALGO)) | + (map_bypass[x2_op->type != SDP_OP_ADD && + x2_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_BN_CFG_0, + BN_MUL_BYPASS)) | + (map_prelu[x2_op->act == ACTIVATION_PRELU] + << SHIFT(SDP_D_DP_BN_CFG_0, + BN_MUL_PRELU)) | + (map_bypass[x2_op->act == ACTIVATION_RELU] + << SHIFT(SDP_D_DP_BN_CFG_0, + BN_RELU_BYPASS)); + sdp_reg_write(engine, D_DP_BN_CFG, reg); + + if (x2_op->enable) { + if (x2_op->type == SDP_OP_ADD || + x2_op->type == SDP_OP_BOTH) { + reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_BN_ALU_CFG_0, + BN_ALU_SRC)) | + (x2_op->shift_value << + SHIFT(SDP_D_DP_BN_ALU_CFG_0, + BN_ALU_SHIFT_VALUE)); + sdp_reg_write(engine, D_DP_BN_ALU_CFG, reg); + } + 
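+		/*
+		 * Mirrors the BS path above: in per-layer mode the BN ALU and
+		 * MUL operands come from the *_SRC_VALUE registers written
+		 * below (BN_ALU_SRC/BN_MUL_SRC select the register source)
+		 * instead of the BN RDMA stream.
+		 */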
+ if (x2_op->mode == SDP_OP_PER_LAYER) { + sdp_reg_write(engine, D_DP_BN_ALU_SRC_VALUE, + x2_op->alu_operand); + sdp_reg_write(engine, D_DP_BN_MUL_SRC_VALUE, + x2_op->mul_operand); + } + + reg = (map_alu_src[x2_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_BN_MUL_CFG_0, + BN_MUL_SRC)) | + (x2_op->truncate << + SHIFT(SDP_D_DP_BN_MUL_CFG_0, + BN_MUL_SHIFT_VALUE)); + sdp_reg_write(engine, D_DP_BN_MUL_CFG, reg); + } + + /* Config EW module */ + reg = (map_bypass[y_op->enable] << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_BYPASS)) | + (map_bypass[y_op->type != SDP_OP_MUL && + y_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_ALU_BYPASS)) | + (map_alu_op[y_op->alu_type] << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_ALU_ALGO)) | + (map_bypass[y_op->type != SDP_OP_ADD && + y_op->type != SDP_OP_NONE] << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_MUL_BYPASS)) | + ((map_prelu[y_op->act == ACTIVATION_PRELU]) << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_MUL_PRELU)) | + (map_bypass[y_op->act == ACTIVATION_LUT] << + SHIFT(SDP_D_DP_EW_CFG_0, + EW_LUT_BYPASS)); + sdp_reg_write(engine, D_DP_EW_CFG, reg); + + if (y_op->enable) { + if (y_op->type == SDP_OP_ADD || y_op->type == SDP_OP_BOTH) { + reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_EW_ALU_CFG_0, + EW_ALU_SRC)) | + (map_bypass[y_op->cvt.alu_cvt.enable] << + SHIFT(SDP_D_DP_EW_ALU_CFG_0, + EW_ALU_CVT_BYPASS)); + sdp_reg_write(engine, D_DP_EW_ALU_CFG, reg); + + if (y_op->mode == SDP_OP_PER_LAYER) { + sdp_reg_write(engine, D_DP_EW_ALU_SRC_VALUE, + y_op->alu_operand); + } else { + sdp_reg_write(engine, D_DP_EW_ALU_CVT_OFFSET_VALUE, + y_op->cvt.alu_cvt.offset); + sdp_reg_write(engine, D_DP_EW_ALU_CVT_SCALE_VALUE, + y_op->cvt.alu_cvt.scale); + sdp_reg_write(engine, D_DP_EW_ALU_CVT_TRUNCATE_VALUE, + y_op->cvt.alu_cvt.truncate); + } + } + + if (y_op->type == SDP_OP_MUL || y_op->type == SDP_OP_BOTH) { + reg = (map_alu_src[y_op->mode == SDP_OP_PER_LAYER] << + SHIFT(SDP_D_DP_EW_MUL_CFG_0, + EW_MUL_SRC)) | + (map_bypass[y_op->cvt.mul_cvt.enable] << + SHIFT(SDP_D_DP_EW_MUL_CFG_0, + EW_MUL_CVT_BYPASS)); + sdp_reg_write(engine, D_DP_EW_MUL_CFG, reg); + + if (y_op->mode == SDP_OP_PER_LAYER) { + sdp_reg_write(engine, D_DP_EW_MUL_SRC_VALUE, + y_op->mul_operand); + } else { + sdp_reg_write(engine, D_DP_EW_MUL_CVT_OFFSET_VALUE, + y_op->cvt.mul_cvt.offset); + sdp_reg_write(engine, D_DP_EW_MUL_CVT_SCALE_VALUE, + y_op->cvt.mul_cvt.scale); + sdp_reg_write(engine, D_DP_EW_MUL_CVT_TRUNCATE_VALUE, + y_op->cvt.mul_cvt.truncate); + } + } + + sdp_reg_write(engine, D_DP_EW_TRUNCATE_VALUE, y_op->truncate); + } + + reg = (map_fly[sdp_surface->src_data.type == NVDLA_MEM_HW] << + SHIFT(SDP_D_FEATURE_MODE_CFG_0, + FLYING_MODE)) | + (map_dst[sdp_surface->dst_data.type == NVDLA_MEM_HW] << + SHIFT(SDP_D_FEATURE_MODE_CFG_0, + OUTPUT_DST)) | + (map_wg[sdp_op->conv_mode == CONV_MODE_WINOGRAD] << + SHIFT(SDP_D_FEATURE_MODE_CFG_0, + WINOGRAD)) | + ((sdp_op->batch_num - 1) << + SHIFT(SDP_D_FEATURE_MODE_CFG_0, + BATCH_NUMBER)); + sdp_reg_write(engine, D_FEATURE_MODE_CFG, reg); + sdp_reg_write(engine, D_DST_DMA_CFG, + map_ram_type[sdp_surface->dst_data.type]); + if (sdp_op->batch_num > 1) + sdp_reg_write(engine, D_DST_BATCH_STRIDE, sdp_op->batch_stride); + + reg = + (map_proc_precision[sdp_op->dst_precision][sdp_op->src_precision] << + SHIFT(SDP_D_DATA_FORMAT_0, + PROC_PRECISION)) | + (map_precision[sdp_op->dst_precision] << + SHIFT(SDP_D_DATA_FORMAT_0, + OUT_PRECISION)); + sdp_reg_write(engine, D_DATA_FORMAT, reg); + sdp_reg_write(engine, D_CVT_OFFSET, sdp_op->out_cvt.offset); + sdp_reg_write(engine, 
D_CVT_SCALE, sdp_op->out_cvt.scale);
+	sdp_reg_write(engine, D_CVT_SHIFT, sdp_op->out_cvt.truncate);
+
+exit:
+	return ret;
+}
+
+int
+nvdla_sdp_is_ready(struct nvdla_processor *processor,
+		   struct nvdla_processor_group *group)
+{
+	struct nvdla_processor_group *next_group;
+	struct nvdla_sdp_op_desc *sdp_op;
+
+	sdp_op = &group->operation_desc->sdp_op;
+	next_group = &processor->groups[!group->id];
+
+	/**
+	 * A single LUT is shared between the two SDP groups, so make sure
+	 * their usage does not conflict. LUT write access is also locked
+	 * while the SDP sub-engine is active, so delay programming the LUT
+	 * while the other group is active.
+	 */
+
+	/**
+	 * If the current group does not need a LUT, it can be programmed
+	 * without further checks.
+	 */
+	if (sdp_op->lut_index == -1)
+		return 1;
+
+	/**
+	 * If both groups use the same LUT, the current group can also be
+	 * programmed without further checks. Even if the other group is
+	 * active and the LUT is locked, that group has already programmed it.
+	 */
+	if (next_group->lut_index == sdp_op->lut_index)
+		return 1;
+
+	/**
+	 * If the other group's LUT index is not -1, a different LUT is
+	 * already programmed; do not program the current LUT, since we
+	 * already know it is neither -1 nor the same as the other group's.
+	 */
+	if (next_group->lut_index != -1)
+		return 0;
+
+	/**
+	 * The current group needs a LUT different from the other group's;
+	 * if that group is not active, program it now.
+	 */
+	if (!next_group->active)
+		return 1;
+
+	/**
+	 * Reaching here means the current group needs a different LUT and
+	 * the other group is active. Wait for the other group to become idle.
+	 */
+	return 0;
+}
+
+void
+nvdla_sdp_dump_config(struct nvdla_processor_group *group)
+{
+	struct nvdla_sdp_op_desc *sdp_op;
+	struct nvdla_sdp_surface_desc *sdp_surface;
+
+	sdp_surface = &group->surface_desc->sdp_surface;
+	sdp_op = &group->operation_desc->sdp_op;
+}
+
+int
+nvdla_sdp_program(struct nvdla_engine *engine, struct nvdla_processor_group *group)
+{
+	int32_t ret;
+
+	nvdla_enable_intr(engine, MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK1) |
+				  MASK(GLB_S_INTR_MASK_0, SDP_DONE_MASK0));
+
+	ret = processor_sdp_program(engine, group);
+
+	return ret;
+}
--
2.25.1