On 29/08/2023 17:15, Boris Brezillon wrote:
> On Wed, 16 Aug 2023 17:01:56 +0100
> Steven Price <steven.price@xxxxxxx> wrote:
>
>> On 09/08/2023 17:53, Boris Brezillon wrote:

[...]

>>> +/**
>>> + * panthor_fw_mem_alloc() - Allocate a FW memory object and map it to the MCU VM.
>>> + * @ptdev: Device.
>>> + * @size: Size of the memory block.
>>> + * @bo_flags: BO flags.
>>> + * @vm_map_flags: VM_MAP flags.
>>> + * @va: Virtual address of the MCU mapping.
>>> + * Set to PANTHOR_GEM_ALLOC_VA for automatic VA-assignment. In that case, the
>>> + * VA will be allocated in the shared VA space.
>>> + *
>>> + * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
>>> + */
>>> +static struct panthor_fw_mem *
>>> +panthor_fw_mem_alloc(struct panthor_device *ptdev, size_t size,
>>> +		     u32 bo_flags, u32 vm_map_flags, u64 va)
>>> +{
>>> +	struct panthor_fw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>>> +	int ret;
>>> +
>>> +	if (!mem)
>>> +		return ERR_PTR(-ENOMEM);
>>> +
>>> +	mem->bo = panthor_gem_create_and_map(ptdev, ptdev->fw->vm,
>>> +					     size, bo_flags, vm_map_flags,
>>> +					     &va, NULL);
>>> +	if (IS_ERR(mem->bo)) {
>>> +		ret = PTR_ERR(mem->bo);
>>> +		mem->bo = NULL;
>>> +		goto err_free_mem;
>>> +	}
>>> +
>>> +	mem->va = va;
>>> +	return mem;
>>> +
>>> +err_free_mem:
>>> +	panthor_fw_mem_free(ptdev, mem);
>>> +	return ERR_PTR(ret);
>>
>> The error handling seems more complex than needed, how about:
>>
>> 	struct panthor_fw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>> 	struct panthor_gem_object *bo;
>> 	int ret;
>>
>> 	if (!mem)
>> 		return ERR_PTR(-ENOMEM);
>>
>> 	bo = panthor_gem_create_and_map(ptdev, ptdev->fw->vm,
>> 					size, bo_flags, vm_map_flags,
>> 					&va, NULL);
>>
>> 	if (IS_ERR(bo)) {
>> 		kfree(mem);
>> 		return ERR_CAST(bo);
>> 	}
>>
>> 	mem->bo = bo;
>> 	mem->va = va;
>> 	return mem;
>>
>> Which I think also means we don't need the "if (mem->bo)" case in
>> panthor_fw_mem_free().
>
> Not so sure about that one. I've been adding code to existing functions
> and having a structured error path, with free functions that can deal
> with partially initialized objects, makes code addition less error-prone.
> I agree on the local bo variable to avoid mem->bo re-initialization
> though.

Yeah, the "free accepting NULL" style is generally a good one, so leaving
the NULL check in panthor_fw_mem_free() is fine. It was just that, in this
case, having to explicitly assign NULL before the call to
panthor_fw_mem_free() looked ugly.

>>
>>> +}
>>> +
>
> [...]
>
>>> +/**
>>> + * panthor_fw_alloc_suspend_buf_mem() - Allocate a suspend buffer for a command stream group.
>>> + * @ptdev: Device.
>>> + * @size: Size of the suspend buffer.
>>> + *
>>> + * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
>>> + */
>>> +struct panthor_fw_mem *
>>> +panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size)
>>> +{
>>> +	return panthor_fw_mem_alloc(ptdev, size,
>>> +				    DRM_PANTHOR_BO_NO_MMAP,
>>> +				    DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
>>> +				    PANTHOR_GEM_ALLOC_VA);
>>> +}
>>> +
>>> +static int panthor_fw_load_section_entry(struct panthor_device *ptdev,
>>> +					 const struct firmware *fw,
>>> +					 struct panthor_fw_binary_iter *iter,
>>> +					 u32 ehdr)
>>> +{
>>> +	struct panthor_fw_binary_section_entry_hdr hdr;
>>> +	struct panthor_fw_section *section;
>>> +	u32 section_size;
>>> +	u32 name_len;
>>> +	int ret;
>>> +
>>> +	ret = panthor_fw_binary_iter_read(ptdev, iter, &hdr, sizeof(hdr));
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	if (hdr.data.end < hdr.data.start) {
>>> +		drm_err(&ptdev->base, "Firmware corrupted, data.end < data.start (0x%x < 0x%x)\n",
>>> +			hdr.data.end, hdr.data.start);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (hdr.va.end < hdr.va.start) {
>>> +		drm_err(&ptdev->base, "Firmware corrupted, hdr.va.end < hdr.va.start (0x%x < 0x%x)\n",
>>> +			hdr.va.end, hdr.va.start);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (hdr.data.end > fw->size) {
>>> +		drm_err(&ptdev->base, "Firmware corrupted, file truncated? data_end=0x%x > fw size=0x%zx\n",
>>> +			hdr.data.end, fw->size);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if ((hdr.va.start & ~PAGE_MASK) != 0 ||
>>> +	    (hdr.va.end & ~PAGE_MASK) != 0) {
>>> +		drm_err(&ptdev->base, "Firmware corrupted, virtual addresses not page aligned: 0x%x-0x%x\n",
>>> +			hdr.va.start, hdr.va.end);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (hdr.flags & ~CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS) {
>>> +		drm_err(&ptdev->base, "Firmware contains interface with unsupported flags (0x%x)\n",
>>> +			hdr.flags);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	if (hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_PROT) {
>>> +		drm_warn(&ptdev->base,
>>> +			 "Firmware protected mode entry not be supported, ignoring");
>>> +		return 0;
>>> +	}
>>> +
>>> +	if (hdr.va.start == CSF_MCU_SHARED_REGION_START &&
>>> +	    !(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED)) {
>>> +		drm_err(&ptdev->base,
>>> +			"Interface at 0x%llx must be shared", CSF_MCU_SHARED_REGION_START);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	name_len = iter->size - iter->offset;
>>> +
>>> +	section = drmm_kzalloc(&ptdev->base, sizeof(*section), GFP_KERNEL);
>>> +	if (!section)
>>> +		return -ENOMEM;
>>> +
>>> +	list_add_tail(&section->node, &ptdev->fw->sections);
>>> +	section->flags = hdr.flags;
>>> +	section->data.size = hdr.data.end - hdr.data.start;
>>> +
>>> +	if (section->data.size > 0) {
>>> +		void *data = drmm_kmalloc(&ptdev->base, section->data.size, GFP_KERNEL);
>>> +
>>> +		if (!data)
>>> +			return -ENOMEM;
>>> +
>>> +		memcpy(data, fw->data + hdr.data.start, section->data.size);
>>> +		section->data.buf = data;
>>> +	}
>>> +
>>> +	if (name_len > 0) {
>>> +		char *name = drmm_kmalloc(&ptdev->base, name_len + 1, GFP_KERNEL);
>>> +
>>> +		if (!name)
>>> +			return -ENOMEM;
>>> +
>>> +		memcpy(name, iter->data + iter->offset, name_len);
>>> +		name[name_len] = '\0';
>>> +		section->name = name;
>>> +	}
>>> +
>>> +	section_size = hdr.va.end - hdr.va.start;
>>> +	if (section_size) {
>>> +		u32 cache_mode = hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK;
>>> +		u32 vm_map_flags = 0;
>>> +		struct sg_table *sgt;
>>> +		u64 va = hdr.va.start;
>>> +
>>> +		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
>>> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_READONLY;
>>> +
>>> +		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_EX))
>>> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC;
>>> +
>>> +		/* TODO: CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_*_COHERENT are mapped to
>>> +		 * non-cacheable for now. We might want to introduce a new
>>> +		 * IOMMU_xxx flag (or abuse IOMMU_MMIO, which maps to device
>>> +		 * memory and is currently not used by our driver) for
>>> +		 * AS_MEMATTR_AARCH64_SHARED memory, so we can take benefit
>>> +		 * of IO-coherent systems.
>>> +		 */
>>> +		if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED)
>>> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED;
>>> +
>>> +		/* Shared section is in the auto-VA range. We need to
>>> +		 * reserve the VA range so it's not allocated to someone else.
>>> +		 */
>>> +		if (va >= CSF_MCU_SHARED_REGION_START &&
>>> +		    va < CSF_MCU_SHARED_REGION_START + CSF_MCU_SHARED_REGION_SIZE)
>>> +			va = PANTHOR_GEM_ALLOC_VA;
>>> +
>>> +		section->mem = panthor_fw_mem_alloc(ptdev, section_size,
>>> +						    DRM_PANTHOR_BO_NO_MMAP,
>>> +						    vm_map_flags, va);
>>> +		if (IS_ERR(section->mem))
>>> +			return PTR_ERR(section->mem);
>>> +
>>> +		if (drm_WARN_ON(&ptdev->base, section->mem->va != hdr.va.start))
>>> +			return -EINVAL;
>>> +
>>> +		panthor_fw_init_section_mem(ptdev, section);
>>> +
>>> +		sgt = drm_gem_shmem_get_pages_sgt(&section->mem->bo->base);
>>> +		if (IS_ERR(sgt))
>>> +			return PTR_ERR(section->mem);
>>> +
>>> +		dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
>>> +
>>> +		if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED) {
>>> +			if (!panthor_fw_mem_vmap(section->mem))
>>
>> Moving this before panthor_fw_init_section_mem() would avoid an
>> unnecessary unmap/remap - although this isn't exactly a performance path...
>
> Sure, I can do that.
>
>>
>>> +				return -ENOMEM;
>>> +		}
>>> +	}
>>> +
>>> +	if (hdr.va.start == CSF_MCU_SHARED_REGION_START)
>>> +		ptdev->fw->shared_section = section;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static void
>>> +panthor_reload_fw_sections(struct panthor_device *ptdev, bool full_reload)
>>> +{
>>> +	struct panthor_fw_section *section;
>>> +
>>> +	list_for_each_entry(section, &ptdev->fw->sections, node) {
>>> +		struct sg_table *sgt;
>>> +
>>> +		if (!full_reload && !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
>>> +			continue;
>>> +
>>> +		panthor_fw_init_section_mem(ptdev, section);
>>> +		sgt = drm_gem_shmem_get_pages_sgt(&section->mem->bo->base);
>>> +		if (!drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(sgt)))
>>> +			dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
>>> +	}
>>> +}
>>> +
>>> +static int panthor_fw_load_entry(struct panthor_device *ptdev,
>>> +				 const struct firmware *fw,
>>> +				 struct panthor_fw_binary_iter *iter)
>>> +{
>>> +	struct panthor_fw_binary_iter eiter;
>>> +	u32 ehdr;
>>> +	int ret;
>>> +
>>> +	ret = panthor_fw_binary_iter_read(ptdev, iter, &ehdr, sizeof(ehdr));
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	if ((iter->offset % sizeof(u32)) ||
>>> +	    (CSF_FW_BINARY_ENTRY_SIZE(ehdr) % sizeof(u32))) {
>>> +		drm_err(&ptdev->base, "Firmware entry isn't 32 bit aligned, offset=0x%x size=0x%x\n",
>>> +			(u32)(iter->offset - sizeof(u32)), CSF_FW_BINARY_ENTRY_SIZE(ehdr));
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	eiter.offset = 0;
>>> +	eiter.data = iter->data + iter->offset;
>>> +	eiter.size = CSF_FW_BINARY_ENTRY_SIZE(ehdr) - sizeof(ehdr);
>>> +	iter->offset += eiter.size;
>>
>> There should really be a check like:
>>
>> 	if (iter->offset < eiter.size)
>> 		return -EINVAL;
>
> Uh, I thought I had added size checks everywhere, but I apparently
> missed some places.
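
No worries - it's easy to miss. For reference, this is roughly the shape of
check I had in mind, as a sketch only (it reuses the names from this patch
and assumes the parent iterator's offset can never exceed its size, which
panthor_fw_binary_iter_read() already guarantees for the main iterator):

	/* Reject entries whose declared size runs past the end of the
	 * firmware binary, written in a form that can't wrap around.
	 */
	if (eiter.size > iter->size - iter->offset)
		return -EINVAL;

	iter->offset += eiter.size;

A dedicated helper would obviously package this up more neatly - more on
that below.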
>
>>
>> otherwise I think it's possible for a corrupt firmware to cause us to
>> run off the end of the buffer. Ideally the check would look something
>> more like the one in panthor_fw_binary_iter_read() (dealing with
>> potential overflow). I'm wondering if it makes sense to allow
>> panthor_fw_binary_iter_read() with a NULL 'out' and check the return
>> value. That way we can replace "iter->offset += eiter.size" with:
>>
>> 	ret = panthor_fw_binary_iter_read(ptdev, iter, NULL,
>> 					  eiter.size);
>> 	if (ret)
>> 		return ret;
>>
>> (or have a new _skip() function)
>
> Might make sense to add a panthor_fw_binary_sub_iter_init() helper that
> would take care of doing the size check on the main iter, unless you
> see other places requiring a size check that are not expressed as
> sub-iterators.

It was only in the sub-iterators that I spotted the missing size check.
A helper for the sub-iterators is probably more clear than my 'skip'
function.

[...]

>>> +struct panthor_fw_ringbuf_input_iface {
>>> +	u64 insert;
>>> +	u64 extract;
>>> +} __packed;
>>> +
>>> +struct panthor_fw_ringbuf_output_iface {
>>> +	u64 extract;
>>> +	u32 active;
>>> +} __packed;
>>
>> Is there a good reason for these to be marked '__packed'? They are
>> naturally aligned so there's no padding, and we guarantee they are page
>> aligned. The compiler might have more freedom if they are not marked
>> __packed.
>
> Nope, no good reason.
>
>>
>>> +
>>> +struct panthor_fw_cs_control_iface {
>>> +#define CS_FEATURES_WORK_REGS(x)	(((x) & GENMASK(7, 0)) + 1)
>>> +#define CS_FEATURES_SCOREBOARDS(x)	(((x) & GENMASK(15, 8)) >> 8)
>>> +#define CS_FEATURES_COMPUTE		BIT(16)
>>> +#define CS_FEATURES_FRAGMENT		BIT(17)
>>> +#define CS_FEATURES_TILER		BIT(18)
>>> +	u32 features;
>>> +	u32 input_va;
>>> +	u32 output_va;
>>> +} __packed;
>>
>> Here I have to admit I can't find a statement in the spec saying that
>> the stride must be a multiple of 4 bytes... but kbase makes that assumption.
>
> The stride of?

The stride of this structure (panthor_fw_cs_control_iface or
STREAM_CONTROL_BLOCK in the spec). The stride is defined by
GROUP_CONTROL_BLOCK::GROUP_STREAM_STRIDE
(panthor_fw_csg_control_iface->stream_stride here), but the spec doesn't
specify that the FW must obey any restrictions on the stride. For that
reason the use of __packed here is technically correct (the FW could
choose a stride which causes this structure to be mis-aligned).

In reality the firmware always aligns to 4 bytes and kbase depends on
this. And I've raised this internally, so hopefully a future spec will
include the 4 byte alignment requirement.

TL;DR: the __packed specifiers shouldn't be needed on any of these
structures.
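
If we do drop __packed and rely on the 4 byte alignment, it would be worth
rejecting anything else when the control interface is parsed. A rough
sketch of what I mean - the helper name is made up, not something in this
series:

	/* Hypothetical parse-time check: reject a stride that would
	 * misalign or truncate the per-stream control blocks.
	 */
	static int panthor_fw_check_cs_stride(struct panthor_device *ptdev,
					      u32 stream_stride)
	{
		if (!IS_ALIGNED(stream_stride, sizeof(u32)) ||
		    stream_stride < sizeof(struct panthor_fw_cs_control_iface)) {
			drm_err(&ptdev->base, "Unsupported stream stride (0x%x)\n",
				stream_stride);
			return -EINVAL;
		}

		return 0;
	}

That way the driver's alignment assumption is at least enforced rather
than silently relied on.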
>>
>>> +
>>> +struct panthor_fw_cs_input_iface {
>>> +#define CS_STATE_MASK			GENMASK(2, 0)
>>> +#define CS_STATE_STOP			0
>>> +#define CS_STATE_START			1
>>> +#define CS_EXTRACT_EVENT		BIT(4)
>>> +#define CS_IDLE_SYNC_WAIT		BIT(8)
>>> +#define CS_IDLE_PROTM_PENDING		BIT(9)
>>> +#define CS_IDLE_EMPTY			BIT(10)
>>> +#define CS_IDLE_RESOURCE_REQ		BIT(11)
>>> +#define CS_TILER_OOM			BIT(26)
>>> +#define CS_PROTM_PENDING		BIT(27)
>>> +#define CS_FATAL			BIT(30)
>>> +#define CS_FAULT			BIT(31)
>>> +#define CS_REQ_MASK			(CS_STATE_MASK | \
>>> +					 CS_EXTRACT_EVENT | \
>>> +					 CS_IDLE_SYNC_WAIT | \
>>> +					 CS_IDLE_PROTM_PENDING | \
>>> +					 CS_IDLE_EMPTY | \
>>> +					 CS_IDLE_RESOURCE_REQ)
>>> +#define CS_EVT_MASK			(CS_TILER_OOM | \
>>> +					 CS_PROTM_PENDING | \
>>> +					 CS_FATAL | \
>>> +					 CS_FAULT)
>>> +	u32 req;
>>> +
>>> +#define CS_CONFIG_PRIORITY(x)		((x) & GENMASK(3, 0))
>>> +#define CS_CONFIG_DOORBELL(x)		(((x) << 8) & GENMASK(15, 8))
>>> +	u32 config;
>>> +	u32 reserved1;
>>> +	u32 ack_irq_mask;
>>> +	u64 ringbuf_base;
>>> +	u32 ringbuf_size;
>>> +	u32 reserved2;
>>> +	u64 heap_start;
>>> +	u64 heap_end;
>>> +	u64 ringbuf_input;
>>> +	u64 ringbuf_output;
>>> +	u32 instr_config;
>>> +	u32 instrbuf_size;
>>> +	u64 instrbuf_base;
>>> +	u64 instrbuf_offset_ptr;
>>> +} __packed;
>>
>> The spec says this has a minimal alignment of 64 bytes. Although I guess
>> the code should check this if we remove __packed and rely on it.
>
> The allocation granularity is 4k, and we're not even in control of the
> offset inside the FW interface section. So yes, we can check it when
> parsing the FW sections, but there's no point adding __aligned() here.

Sorry, no, I wasn't intending that we'd add __aligned() - I was just
trying to justify (to myself) that the __packed wasn't necessary.

>>
>>> +
>>> +struct panthor_fw_cs_output_iface {
>>> +	u32 ack;
>>> +	u32 reserved1[15];
>>> +	u64 status_cmd_ptr;
>>> +
>>> +#define CS_STATUS_WAIT_SB_MASK		GENMASK(15, 0)
>>> +#define CS_STATUS_WAIT_SB_SRC_MASK	GENMASK(19, 16)
>>> +#define CS_STATUS_WAIT_SB_SRC_NONE	(0 << 16)
>>> +#define CS_STATUS_WAIT_SB_SRC_WAIT	(8 << 16)
>>> +#define CS_STATUS_WAIT_SYNC_COND_LE	(0 << 24)
>>> +#define CS_STATUS_WAIT_SYNC_COND_GT	(1 << 24)
>>> +#define CS_STATUS_WAIT_SYNC_COND_MASK	GENMASK(27, 24)
>>> +#define CS_STATUS_WAIT_PROGRESS		BIT(28)
>>> +#define CS_STATUS_WAIT_PROTM		BIT(29)
>>> +#define CS_STATUS_WAIT_SYNC_64B		BIT(30)
>>> +#define CS_STATUS_WAIT_SYNC		BIT(31)
>>> +	u32 status_wait;
>>> +	u32 status_req_resource;
>>> +	u64 status_wait_sync_ptr;
>>> +	u32 status_wait_sync_value;
>>> +	u32 status_scoreboards;
>>> +
>>> +#define CS_STATUS_BLOCKED_REASON_UNBLOCKED	0
>>> +#define CS_STATUS_BLOCKED_REASON_SB_WAIT	1
>>> +#define CS_STATUS_BLOCKED_REASON_PROGRESS_WAIT	2
>>> +#define CS_STATUS_BLOCKED_REASON_SYNC_WAIT	3
>>> +#define CS_STATUS_BLOCKED_REASON_DEFERRED	5
>>> +#define CS_STATUS_BLOCKED_REASON_RES		6
>>> +#define CS_STATUS_BLOCKED_REASON_FLUSH		7
>>> +#define CS_STATUS_BLOCKED_REASON_MASK		GENMASK(3, 0)
>>> +	u32 status_blocked_reason;
>>> +	u32 status_wait_sync_value_hi;
>>> +	u32 reserved2[6];
>>> +
>>> +#define CS_EXCEPTION_TYPE(x)		((x) & GENMASK(7, 0))
>>> +#define CS_EXCEPTION_DATA(x)		(((x) >> 8) & GENMASK(23, 0))
>>> +	u32 fault;
>>> +	u32 fatal;
>>> +	u64 fault_info;
>>> +	u64 fatal_info;
>>> +	u32 reserved3[10];
>>> +	u32 heap_vt_start;
>>> +	u32 heap_vt_end;
>>> +	u32 reserved4;
>>> +	u32 heap_frag_end;
>>> +	u64 heap_address;
>>> +} __packed;
>>
>> Output is the same as input.
>
> You mean in terms of alignment?

Yep.
(Sorry, I did a terrible job of explaining myself here - I got rather
distracted trying to work out what alignment was guaranteed by the spec
for all these different structures.)

>>
>>> +
>>> +struct panthor_fw_csg_control_iface {
>>> +	u32 features;
>>> +	u32 input_va;
>>> +	u32 output_va;
>>> +	u32 suspend_size;
>>> +	u32 protm_suspend_size;
>>> +	u32 stream_num;
>>> +	u32 stream_stride;
>>> +} __packed;
>>
>> The spec is ambiguous here. In one place it states the stride is 256
>> bytes, but in another that you need to look at the GLB_GROUP_STRIDE
>> value. In practice we can rely on 4 byte alignment.
>>
>> I'm beginning to wonder if it's worth worrying about, so I think I'll
>> stop here ;)
>
> Hehe. I'll add checks where I can in the parsing logic. I guess having
> things naturally aligned and making sure there's no overlap with other
> interfaces is a minimum.

Yes, that would be good, and like I said there should be a clarification
in later specs that everything is (at least) 4 byte aligned. Apparently
the 256 byte stride mentioned in one place was due to the way the
structure was expressed in the XML and the XML->HTML tool calculating it.
Or in one word: 'wrong'! ;)

Steve
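
PS: for the "naturally aligned" half of those checks, even something as
small as the below at parse time would do. This is a sketch only - the
variable names are made up, and it just encodes the fact that u64 is the
widest member in the input/output interfaces:

	/* Hypothetical check while parsing the control interface: the
	 * input/output blocks contain u64 fields, so the offsets the FW
	 * reports must be 8-byte aligned once __packed is dropped.
	 */
	if (!IS_ALIGNED(input_va, sizeof(u64)) ||
	    !IS_ALIGNED(output_va, sizeof(u64)))
		return -EINVAL;

The overlap check is obviously a bit more involved, so no need to block
the series on getting that part perfect.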