Re: [PATCH v2 04/15] drm/panthor: Add the device logical block

Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> · Tue, 29 Aug 2023 16:00:35 +0200

On Fri, 11 Aug 2023 16:47:56 +0100
Steven Price <steven.price@xxxxxxx> wrote:

> On 09/08/2023 17:53, Boris Brezillon wrote:
> > The panthor driver is designed in a modular way, where each logical
> > block is dealing with a specific HW-block or software feature. In order
> > for those blocks to communicate with each other, we need a central
> > panthor_device collecting all the blocks, and exposing some common
> > features, like interrupt handling, power management, reset, ...
> > 
> > This what this panthor_device logical block is about.
> > 
> > v2:
> > - Rename the driver (pancsf -> panthor)
> > - Change the license (GPL2 -> MIT + GPL2)
> > - Split the driver addition commit
> > - Add devfreq/PM support
> > - Use drm_dev_{unplug,enter,exit}() to provide safe device removal
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx>
> > ---
> >  drivers/gpu/drm/panthor/panthor_device.c | 479 +++++++++++++++++++++++
> >  drivers/gpu/drm/panthor/panthor_device.h | 354 +++++++++++++++++
> >  2 files changed, 833 insertions(+)
> >  create mode 100644 drivers/gpu/drm/panthor/panthor_device.c
> >  create mode 100644 drivers/gpu/drm/panthor/panthor_device.h
> > 
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
> > new file mode 100644
> > index 000000000000..15f102116fa0
> > --- /dev/null
> > +++ b/drivers/gpu/drm/panthor/panthor_device.c
> > @@ -0,0 +1,479 @@
> > +// SPDX-License-Identifier: GPL-2.0 or MIT
> > +/* Copyright 2018 Marty E. Plummer <hanetzer@xxxxxxxxxxxxx> */
> > +/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@xxxxxxxxxx> */
> > +/* Copyright 2023 Collabora ltd. */
> > +
> > +#include <linux/clk.h>
> > +#include <linux/reset.h>
> > +#include <linux/platform_device.h>
> > +#include <linux/pm_domain.h>
> > +#include <linux/pm_runtime.h>
> > +#include <linux/regulator/consumer.h>
> > +
> > +#include <drm/drm_drv.h>
> > +#include <drm/drm_managed.h>
> > +
> > +#include "panthor_sched.h"
> > +#include "panthor_device.h"
> > +#include "panthor_devfreq.h"
> > +#include "panthor_gpu.h"
> > +#include "panthor_fw.h"
> > +#include "panthor_mmu.h"
> > +#include "panthor_regs.h"
> > +
> > +static int panthor_clk_init(struct panthor_device *ptdev)
> > +{
> > +	ptdev->clks.core = devm_clk_get(ptdev->base.dev, NULL);
> > +	if (IS_ERR(ptdev->clks.core)) {
> > +		drm_err(&ptdev->base, "get 'core' clock failed %ld\n",
> > +			PTR_ERR(ptdev->clks.core));  
> 
> I suspect it would be a good idea to use dev_err_probe() here (and
> below) as I believe devm_clk_get can return -EPROBE_DEFER.

Nice, didn't know there was a logging function that was silencing
probe-defer errors.

> 
> > +		return PTR_ERR(ptdev->clks.core);
> > +	}
> > +
> > +	ptdev->clks.stacks = devm_clk_get_optional(ptdev->base.dev, "stacks");
> > +	if (IS_ERR(ptdev->clks.stacks)) {
> > +		drm_err(&ptdev->base, "get 'stacks' clock failed %ld\n",
> > +			PTR_ERR(ptdev->clks.stacks));
> > +		return PTR_ERR(ptdev->clks.stacks);
> > +	}
> > +
> > +	ptdev->clks.coregroup = devm_clk_get_optional(ptdev->base.dev, "coregroup");
> > +	if (IS_ERR(ptdev->clks.coregroup)) {
> > +		drm_err(&ptdev->base, "get 'coregroup' clock failed %ld\n",
> > +			PTR_ERR(ptdev->clks.coregroup));
> > +		return PTR_ERR(ptdev->clks.coregroup);
> > +	}
> > +
> > +	drm_info(&ptdev->base, "clock rate = %lu\n", clk_get_rate(ptdev->clks.core));
> > +	return 0;
> > +}
> > +
> > +void panthor_device_unplug(struct panthor_device *ptdev)
> > +{
> > +	/* FIXME: This is racy. */  
> 
> Can we fix this? From a quick look it seems like a sequence like below
> should avoid the race.
> 
> 	if (!drm_dev_enter())
> 		/* Already unplugged */
> 		return;
> 	ptdev->base.unplugged = true;
> 	drm_dev_exit();
> 
> Although possibly that should be in the DRM core rather than open-coded
> here.

Are you sure that's protecting us against two concurrent calls to
drm_dev_unplug() (drm_dev_enter() is taking a read-lock)? And that's not
the only thing I need actually. If there are 2 threads entering
panthor_device_unplug(), I need to make sure the one who losts (arrived
after unplugged was set to false) is waiting for all operations after
the drm_dev_unplug() call to be done, otherwise we might return from
platform_driver->remove() before the unplug cleanups are done, and
there might still be threads/workqueues accessing device resources
while/after they get released by the device-model.

> 
> > +	if (drm_dev_is_unplugged(&ptdev->base))
> > +		return;
> > +
> > +	drm_WARN_ON(&ptdev->base, pm_runtime_get_sync(ptdev->base.dev) < 0);
> > +
> > +	/* Call drm_dev_unplug() so any access to HW block happening after
> > +	 * that point get rejected.
> > +	 */
> > +	drm_dev_unplug(&ptdev->base);
> > +
> > +	/* Now, try to cleanly shutdown the GPU before the device resources
> > +	 * get reclaimed.
> > +	 */
> > +	panthor_sched_unplug(ptdev);
> > +	panthor_fw_unplug(ptdev);
> > +	panthor_mmu_unplug(ptdev);
> > +	panthor_gpu_unplug(ptdev);
> > +
> > +	pm_runtime_dont_use_autosuspend(ptdev->base.dev);
> > +	pm_runtime_put_sync_suspend(ptdev->base.dev);
> > +}
> > +
> > +static void panthor_device_reset_cleanup(struct drm_device *ddev, void *data)
> > +{
> > +	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
> > +
> > +	cancel_work_sync(&ptdev->reset.work);
> > +	destroy_workqueue(ptdev->reset.wq);
> > +}
> > +
> > +static void panthor_device_reset_work(struct work_struct *work)
> > +{
> > +	struct panthor_device *ptdev = container_of(work, struct panthor_device, reset.work);
> > +	int ret, cookie;
> > +
> > +	if (!drm_dev_enter(&ptdev->base, &cookie))
> > +		return;
> > +
> > +	panthor_sched_pre_reset(ptdev);
> > +	panthor_fw_pre_reset(ptdev, true);
> > +	panthor_mmu_pre_reset(ptdev);
> > +	panthor_gpu_soft_reset(ptdev);
> > +	panthor_gpu_l2_power_on(ptdev);
> > +	panthor_mmu_post_reset(ptdev);
> > +	ret = panthor_fw_post_reset(ptdev);
> > +	if (ret)
> > +		goto out;
> > +
> > +	atomic_set(&ptdev->reset.pending, 0);
> > +	panthor_sched_post_reset(ptdev);
> > +	drm_dev_exit(cookie);
> > +
> > +out:
> > +	if (ret) {  
> 
> This looks like a race condition too - is there a need for a
> drm_dev_exit_and_unplug() function?

drm_dev_exit() is just releasing the read-lock. drm_dev_unplug()
waits for all readers to be done and sets the unplugged value to true.
So we only get readers/writer synchronization here, but nothing doing
writer/writer sync. I guess the drm core leaves that to drivers, given
drm_dev_unplug() is usually called from xxx_driver->remove() hook, on
which serialization is guaranteed by the device-model.

TLDR; yes, it's racy, but I don't think drm_dev_exit_and_unplug() would
help solve the existing race.

It's worth noting that we currently have only 2 paths calling
panthor_device_unplug(): the platform_driver->remove() hook and the
reset worker. Calling drm_dev_unplug() might not be the right thing to
do, I just thought it was a good match to reflect the fact the device
becomes inaccessible, without adding yet another kind of device-lost
field.

> 
> > +		panthor_device_unplug(ptdev);
> > +		drm_err(&ptdev->base, "Failed to boot MCU after reset, making device unusable.");
> > +	}
> > +}
> > +
> > +static bool panthor_device_is_initialized(struct panthor_device *ptdev)
> > +{
> > +	return !!ptdev->scheduler;
> > +}
> > +
> > +static void panthor_device_free_page(struct drm_device *ddev, void *data)
> > +{
> > +	free_page((unsigned long)data);
> > +}
> > +
> > +int panthor_device_init(struct panthor_device *ptdev)
> > +{
> > +	struct resource *res;
> > +	struct page *p;
> > +	int ret;
> > +
> > +	ptdev->coherent = device_get_dma_attr(ptdev->base.dev) == DEV_DMA_COHERENT;
> > +
> > +	drmm_mutex_init(&ptdev->base, &ptdev->pm.lock);
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
> > +	p = alloc_page(GFP_KERNEL | __GFP_ZERO);
> > +	if (!p)
> > +		return -ENOMEM;
> > +
> > +	ptdev->pm.dummy_latest_flush = page_address(p);
> > +	ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_free_page,
> > +				       ptdev->pm.dummy_latest_flush);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Set the dummy page to the default LATEST_FLUSH value. This
> > +	 * will be updated on the next suspend.
> > +	 */
> > +	*ptdev->pm.dummy_latest_flush = CSF_GPU_LATEST_FLUSH_ID_DEFAULT;  
> 
> I see why this register default value was defined. Although I'm not sure
> it has any benefit over just using zero... If the GPU is off when user
> space reads the FLUSH_ID then the GPU's caches are definitely empty so
> any flush ID is valid.

Zero means we'll force a cache flush for all CS that were created while
the device was suspended, that's not ideal.

> 
> Interestingly looking at kbase it seems to use an initial value of 1
> (POWER_DOWN_LATEST_FLUSH_VALUE). I guess zero is less ideal because
> FLUSH_CACHE2 would then unconditionally do a flush.

I guess a value of 1 would work. It just means we'll get a spurious
flush if the CS is submitted after 32 flushes happened, on the other
hand we also a spurious flush on the first submitted CS when we use
POWER_DOWN_LATEST_FLUSH_VALUE. I'll switch to 1, drop the default def,
and update the comment accordingly.

> 
> > +
> > +	INIT_WORK(&ptdev->reset.work, panthor_device_reset_work);
> > +	ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0);
> > +	if (!ptdev->reset.wq)
> > +		return -ENOMEM;
> > +
> > +	ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_reset_cleanup, NULL);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = panthor_clk_init(ptdev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = panthor_devfreq_init(ptdev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ptdev->iomem = devm_platform_get_and_ioremap_resource(to_platform_device(ptdev->base.dev),
> > +							      0, &res);
> > +	if (IS_ERR(ptdev->iomem))
> > +		return PTR_ERR(ptdev->iomem);
> > +
> > +	ptdev->phys_addr = res->start;
> > +
> > +	ret = devm_pm_runtime_enable(ptdev->base.dev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = pm_runtime_resume_and_get(ptdev->base.dev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = panthor_gpu_init(ptdev);
> > +	if (ret)
> > +		goto err_rpm_put;
> > +
> > +	ret = panthor_mmu_init(ptdev);
> > +	if (ret)
> > +		goto err_rpm_put;
> > +
> > +	ret = panthor_fw_init(ptdev);
> > +	if (ret)
> > +		goto err_rpm_put;
> > +
> > +	ret = panthor_sched_init(ptdev);
> > +	if (ret)
> > +		goto err_rpm_put;
> > +
> > +	/* ~3 frames */
> > +	pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
> > +	pm_runtime_use_autosuspend(ptdev->base.dev);
> > +	pm_runtime_put_autosuspend(ptdev->base.dev);
> > +	return 0;
> > +
> > +err_rpm_put:
> > +	pm_runtime_put_sync_suspend(ptdev->base.dev);
> > +	return ret;
> > +}
> > +
> > +#define PANTHOR_EXCEPTION(id) \
> > +	[DRM_PANTHOR_EXCEPTION_ ## id] = { \
> > +		.name = #id, \
> > +	}
> > +
> > +struct panthor_exception_info {
> > +	const char *name;
> > +};
> > +
> > +static const struct panthor_exception_info panthor_exception_infos[] = {
> > +	PANTHOR_EXCEPTION(OK),
> > +	PANTHOR_EXCEPTION(TERMINATED),
> > +	PANTHOR_EXCEPTION(KABOOM),
> > +	PANTHOR_EXCEPTION(EUREKA),
> > +	PANTHOR_EXCEPTION(ACTIVE),
> > +	PANTHOR_EXCEPTION(CS_RES_TERM),
> > +	PANTHOR_EXCEPTION(CS_CONFIG_FAULT),
> > +	PANTHOR_EXCEPTION(CS_ENDPOINT_FAULT),
> > +	PANTHOR_EXCEPTION(CS_BUS_FAULT),
> > +	PANTHOR_EXCEPTION(CS_INSTR_INVALID),
> > +	PANTHOR_EXCEPTION(CS_CALL_STACK_OVERFLOW),
> > +	PANTHOR_EXCEPTION(CS_INHERIT_FAULT),
> > +	PANTHOR_EXCEPTION(INSTR_INVALID_PC),
> > +	PANTHOR_EXCEPTION(INSTR_INVALID_ENC),
> > +	PANTHOR_EXCEPTION(INSTR_BARRIER_FAULT),
> > +	PANTHOR_EXCEPTION(DATA_INVALID_FAULT),
> > +	PANTHOR_EXCEPTION(TILE_RANGE_FAULT),
> > +	PANTHOR_EXCEPTION(ADDR_RANGE_FAULT),
> > +	PANTHOR_EXCEPTION(IMPRECISE_FAULT),
> > +	PANTHOR_EXCEPTION(OOM),
> > +	PANTHOR_EXCEPTION(CSF_FW_INTERNAL_ERROR),
> > +	PANTHOR_EXCEPTION(CSF_RES_EVICTION_TIMEOUT),
> > +	PANTHOR_EXCEPTION(GPU_BUS_FAULT),
> > +	PANTHOR_EXCEPTION(GPU_SHAREABILITY_FAULT),
> > +	PANTHOR_EXCEPTION(SYS_SHAREABILITY_FAULT),
> > +	PANTHOR_EXCEPTION(GPU_CACHEABILITY_FAULT),
> > +	PANTHOR_EXCEPTION(TRANSLATION_FAULT_0),
> > +	PANTHOR_EXCEPTION(TRANSLATION_FAULT_1),
> > +	PANTHOR_EXCEPTION(TRANSLATION_FAULT_2),
> > +	PANTHOR_EXCEPTION(TRANSLATION_FAULT_3),
> > +	PANTHOR_EXCEPTION(TRANSLATION_FAULT_4),
> > +	PANTHOR_EXCEPTION(PERM_FAULT_0),
> > +	PANTHOR_EXCEPTION(PERM_FAULT_1),
> > +	PANTHOR_EXCEPTION(PERM_FAULT_2),
> > +	PANTHOR_EXCEPTION(PERM_FAULT_3),
> > +	PANTHOR_EXCEPTION(ACCESS_FLAG_1),
> > +	PANTHOR_EXCEPTION(ACCESS_FLAG_2),
> > +	PANTHOR_EXCEPTION(ACCESS_FLAG_3),
> > +	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_IN),
> > +	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT0),
> > +	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT1),
> > +	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT2),
> > +	PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT3),
> > +	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_0),
> > +	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_1),
> > +	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_2),
> > +	PANTHOR_EXCEPTION(MEM_ATTR_FAULT_3),
> > +};
> > +
> > +const char *panthor_exception_name(struct panthor_device *ptdev, u32 exception_code)
> > +{
> > +	if (drm_WARN_ON(&ptdev->base,  
> 
> I'm not convinced this should be a WARN_ON as I suspect it's probably
> possible to inject values from user space (although I'm not completely
> sure on that).

Normally no (it's something returned by the FW), unless userspace gets
access to the kernel <-> FW interface, which would be worrisome :-).

> It's certainly not a driver error as such if we can't
> decode the value.

Ack on dropping the WARN_ON().

> 
> > +			exception_code >= ARRAY_SIZE(panthor_exception_infos) ||
> > +			!panthor_exception_infos[exception_code].name))
> > +		return "Unknown exception type";
> > +
> > +	return panthor_exception_infos[exception_code].name;
> > +}
> > +
> > +static vm_fault_t panthor_mmio_vm_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct panthor_device *ptdev = vma->vm_private_data;
> > +	u64 id = vma->vm_pgoff << PAGE_SHIFT;
> > +	unsigned long pfn;
> > +	pgprot_t pgprot;
> > +	vm_fault_t ret;
> > +	bool active;
> > +	int cookie;
> > +
> > +	if (!drm_dev_enter(&ptdev->base, &cookie))
> > +		return VM_FAULT_SIGBUS;
> > +
> > +	mutex_lock(&ptdev->pm.lock);
> > +	active = atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE;
> > +
> > +	switch (id) {
> > +	case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET:
> > +		if (active)
> > +			pfn = __phys_to_pfn(ptdev->phys_addr + CSF_GPU_LATEST_FLUSH_ID);
> > +		else
> > +			pfn = virt_to_pfn(ptdev->pm.dummy_latest_flush);
> > +		break;
> > +
> > +	default:
> > +		ret = VM_FAULT_SIGBUS;
> > +		goto out_unlock;
> > +	}
> > +
> > +	pgprot = vma->vm_page_prot;
> > +	if (active)
> > +		pgprot = pgprot_noncached(pgprot);
> > +
> > +	ret = vmf_insert_pfn_prot(vma, vmf->address, pfn, pgprot);
> > +
> > +out_unlock:
> > +	mutex_unlock(&ptdev->pm.lock);
> > +	drm_dev_exit(cookie);
> > +	return ret;
> > +}
> > +
> > +static const struct vm_operations_struct panthor_mmio_vm_ops = {
> > +	.fault = panthor_mmio_vm_fault,
> > +};
> > +
> > +int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *vma)
> > +{
> > +	u64 id = vma->vm_pgoff << PAGE_SHIFT;
> > +
> > +	switch (id) {
> > +	case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET:
> > +		if (vma->vm_end - vma->vm_start != PAGE_SIZE ||
> > +		    (vma->vm_flags & (VM_WRITE | VM_EXEC)))
> > +			return -EINVAL;
> > +
> > +		break;
> > +
> > +	default:
> > +		return -EINVAL;
> > +	}
> > +
> > +	/* Defer actual mapping to the fault handler. */
> > +	vma->vm_private_data = ptdev;
> > +	vma->vm_ops = &panthor_mmio_vm_ops;
> > +	vm_flags_set(vma,
> > +		     VM_IO | VM_DONTCOPY | VM_DONTEXPAND |
> > +		     VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP);
> > +	return 0;
> > +}
> > +
> > +#ifdef CONFIG_PM
> > +int panthor_device_resume(struct device *dev)
> > +{
> > +	struct panthor_device *ptdev = dev_get_drvdata(dev);
> > +	int ret, cookie;
> > +
> > +	mutex_lock(&ptdev->pm.lock);
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_RESUMING);
> > +
> > +	ret = clk_prepare_enable(ptdev->clks.core);
> > +	if (ret)
> > +		goto err_unlock;
> > +
> > +	ret = clk_prepare_enable(ptdev->clks.stacks);
> > +	if (ret)
> > +		goto err_disable_core_clk;
> > +
> > +	ret = clk_prepare_enable(ptdev->clks.coregroup);
> > +	if (ret)
> > +		goto err_disable_stacks_clk;
> > +
> > +	ret = panthor_devfreq_resume(ptdev);
> > +	if (ret)
> > +		goto err_disable_coregroup_clk;
> > +
> > +	if (panthor_device_is_initialized(ptdev) &&
> > +	    drm_dev_enter(&ptdev->base, &cookie)) {
> > +		panthor_gpu_resume(ptdev);
> > +		panthor_mmu_resume(ptdev);
> > +		ret = drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev));
> > +		if (!ret)
> > +			panthor_sched_resume(ptdev);
> > +
> > +		drm_dev_exit(cookie);
> > +
> > +		if (ret)
> > +			goto err_devfreq_suspend;
> > +	}
> > +
> > +	/* Clear all IOMEM mappings pointing to this device after we've
> > +	 * resumed. This way the fake mappings pointing to the dummy pages
> > +	 * are removed and the real iomem mapping will be restored on next
> > +	 * access.
> > +	 */
> > +	unmap_mapping_range(ptdev->base.anon_inode->i_mapping,
> > +			    DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1);
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE);  
> 
> Is the ordering here correct? I think we need to set ACTIVE before the
> unmap_mapping_range otherwise there is a (very small) race where user
> space could fault the page (and get the dummy mapping) before the
> atomic_set.

We take the pm.lock in panthor_mmio_vm_fault().

> 
> Hmm, actually we have the pm.lock, so no this isn't racy. In which case
> is there a good reason that you're using atomics? I can see two accesses
> which aren't protected by pm.lock:
> 
>   * the early out in panthor_device_suspend() - which could easily be
> moved inside the lock.

When we're in suspend() we are the one in control of the pm.state, so
no race expected here.

> 
>   * panthor_device_schedule_reset() - this looks racy (the power down
> could happen immediately after the atomic_read()), so I suspect it would
> be better moving the check into panthor_device_reset_work() and
> performing it with the pm.lock held.

I think the main reason for it being an atomic is because I didn't
have PM locking in the initial implementation, but I ended adding
locking at some point because I didn't really have choice. I thought
the race didn't exist because of the workqueue synchronization/work
cancellation that happens in panthor_sched_suspend(), but I see now
that it's not protecting us (thread queuing the job could be paused
just after checking the PM state and resumed after the suspend
happened). This being said, we might have a lock ordering issue if we
take the PM lock in that path (I need to check that).

> 
> > +	if (atomic_read(&ptdev->reset.pending))
> > +		queue_work(ptdev->reset.wq, &ptdev->reset.work);
> > +
> > +	mutex_unlock(&ptdev->pm.lock);
> > +	return 0;
> > +
> > +err_devfreq_suspend:
> > +	panthor_devfreq_suspend(ptdev);
> > +
> > +err_disable_coregroup_clk:
> > +	clk_disable_unprepare(ptdev->clks.coregroup);
> > +
> > +err_disable_stacks_clk:
> > +	clk_disable_unprepare(ptdev->clks.stacks);
> > +
> > +err_disable_core_clk:
> > +	clk_disable_unprepare(ptdev->clks.core);
> > +
> > +err_unlock:
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
> > +	mutex_unlock(&ptdev->pm.lock);
> > +	return ret;
> > +}
> > +
> > +int panthor_device_suspend(struct device *dev)
> > +{
> > +	struct panthor_device *ptdev = dev_get_drvdata(dev);
> > +	int ret, cookie;
> > +
> > +	if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_ACTIVE)
> > +		return 0;
> > +
> > +	mutex_lock(&ptdev->pm.lock);
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDING);
> > +
> > +	/* Clear all IOMEM mappings pointing to this device before we
> > +	 * shutdown the power-domain and clocks. Failing to do that results
> > +	 * in external aborts when the process accesses the iomem region.
> > +	 */
> > +	unmap_mapping_range(ptdev->base.anon_inode->i_mapping,
> > +			    DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1);
> > +
> > +	if (panthor_device_is_initialized(ptdev) &&
> > +	    drm_dev_enter(&ptdev->base, &cookie)) {
> > +		cancel_work_sync(&ptdev->reset.work);
> > +
> > +		/* We prepare everything as if we were resetting the GPU.
> > +		 * The end of the reset will happen in the resume path though.
> > +		 */
> > +		panthor_sched_suspend(ptdev);
> > +		panthor_fw_suspend(ptdev);
> > +		panthor_mmu_suspend(ptdev);
> > +		panthor_gpu_suspend(ptdev);
> > +		drm_dev_exit(cookie);
> > +	}
> > +
> > +	ret = panthor_devfreq_suspend(ptdev);
> > +	if (ret) {
> > +		if (panthor_device_is_initialized(ptdev) &&
> > +		    drm_dev_enter(&ptdev->base, &cookie)) {
> > +			panthor_gpu_resume(ptdev);
> > +			panthor_mmu_resume(ptdev);
> > +			drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev));
> > +			panthor_sched_resume(ptdev);
> > +			drm_dev_exit(cookie);
> > +		}
> > +
> > +		atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE);
> > +		goto out_unlock;
> > +	}
> > +
> > +	/* Before we suspend, update the dummy_latest_flush page, so accesses
> > +	 * to this dummy page return the value the HW would have returned.
> > +	 */
> > +	*ptdev->pm.dummy_latest_flush = gpu_read(ptdev, CSF_GPU_LATEST_FLUSH_ID);  
> 
> As above, I don't believe it is important for user space to know the
> value the HW would have returned during a suspend. Indeed if the
> hardware was successfully suspended the flush ID is likely to be reset -
> so this would be inaccurate. However any value should be safe if the
> work was prepared while the GPU was off as the caches will be empty.

Agreed.

> 
> > +
> > +	clk_disable_unprepare(ptdev->clks.coregroup);
> > +	clk_disable_unprepare(ptdev->clks.stacks);
> > +	clk_disable_unprepare(ptdev->clks.core);
> > +	atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED);
> > +
> > +out_unlock:
> > +	mutex_unlock(&ptdev->pm.lock);
> > +	return ret;
> > +}
> > +#endif
> > diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
> > new file mode 100644
> > index 000000000000..e0e1be263eb9
> > --- /dev/null
> > +++ b/drivers/gpu/drm/panthor/panthor_device.h
> > @@ -0,0 +1,354 @@
> > +/* SPDX-License-Identifier: GPL-2.0 or MIT */
> > +/* Copyright 2018 Marty E. Plummer <hanetzer@xxxxxxxxxxxxx> */
> > +/* Copyright 2019 Linaro, Ltd, Rob Herring <robh@xxxxxxxxxx> */
> > +/* Copyright 2023 Collabora ltd. */
> > +
> > +#ifndef __PANTHOR_DEVICE_H__
> > +#define __PANTHOR_DEVICE_H__
> > +
> > +#include <linux/atomic.h>
> > +#include <linux/io-pgtable.h>
> > +#include <linux/regulator/consumer.h>
> > +#include <linux/spinlock.h>
> > +#include <drm/drm_device.h>
> > +#include <drm/drm_mm.h>
> > +#include <drm/gpu_scheduler.h>
> > +#include <drm/panthor_drm.h>
> > +
> > +struct panthor_csf;
> > +struct panthor_csf_ctx;
> > +struct panthor_device;
> > +struct panthor_gpu;
> > +struct panthor_group_pool;
> > +struct panthor_heap_pool;
> > +struct panthor_job;
> > +struct panthor_mmu;
> > +struct panthor_fw;
> > +struct panthor_perfcnt;
> > +struct panthor_vm;
> > +struct panthor_vm_pool;
> > +
> > +/**
> > + * enum panthor_device_pm_state - PM state
> > + */
> > +enum panthor_device_pm_state {
> > +	/** @PANTHOR_DEVICE_PM_STATE_SUSPENDED: Device is suspended. */
> > +	PANTHOR_DEVICE_PM_STATE_SUSPENDED = 0,
> > +
> > +	/** @PANTHOR_DEVICE_PM_STATE_RESUMING: Device is being resumed. */
> > +	PANTHOR_DEVICE_PM_STATE_RESUMING,
> > +
> > +	/** @PANTHOR_DEVICE_PM_STATE_ACTIVE: Device is active. */
> > +	PANTHOR_DEVICE_PM_STATE_ACTIVE,
> > +
> > +	/** @PANTHOR_DEVICE_PM_STATE_SUSPENDING: Device is being suspended. */
> > +	PANTHOR_DEVICE_PM_STATE_SUSPENDING,
> > +};
> > +
> > +/**
> > + * struct panthor_irq - IRQ data
> > + *
> > + * Used to automate IRQ handling for the 3 different IRQs we have in this driver.
> > + */
> > +struct panthor_irq {
> > +	/** @ptdev: Panthor device */
> > +	struct panthor_device *ptdev;
> > +
> > +	/** @irq: IRQ number. */
> > +	int irq;
> > +
> > +	/** @mask: Current mask being applied to xxx_INT_MASK. */
> > +	u32 mask;
> > +
> > +	/** @suspended: Set to true when the IRQ is suspended. */
> > +	atomic_t suspended;
> > +};
> > +
> > +/**
> > + * struct panthor_device - Panthor device
> > + */
> > +struct panthor_device {
> > +	/** @base: Base drm_device. */
> > +	struct drm_device base;
> > +
> > +	/** @phys_addr: Physical address of the iomem region. */
> > +	phys_addr_t phys_addr;
> > +
> > +	/** @iomem: CPU mapping of the IOMEM region. */
> > +	void __iomem *iomem;
> > +
> > +	/** @clks: GPU clocks. */
> > +	struct {
> > +		/** @core: Core clock. */
> > +		struct clk *core;
> > +
> > +		/** @stacks: Stacks clock. This clock is optional. */
> > +		struct clk *stacks;
> > +
> > +		/** @coregroup: Core group clock. This clock is optional. */
> > +		struct clk *coregroup;
> > +	} clks;
> > +
> > +	/** @coherent: True if the CPU/GPU are memory coherent. */
> > +	bool coherent;
> > +
> > +	/** @gpu_info: GPU information. */
> > +	struct drm_panthor_gpu_info gpu_info;
> > +
> > +	/** @csif_info: Command stream interface information. */
> > +	struct drm_panthor_csif_info csif_info;
> > +
> > +	/** @gpu: GPU management data. */
> > +	struct panthor_gpu *gpu;
> > +
> > +	/** @fw: FW management data. */
> > +	struct panthor_fw *fw;
> > +
> > +	/** @mmu: MMU management data. */
> > +	struct panthor_mmu *mmu;
> > +
> > +	/** @scheduler: Scheduler management data. */
> > +	struct panthor_scheduler *scheduler;
> > +
> > +	/** @devfreq: Device frequency scaling management data. */
> > +	struct panthor_devfreq *devfreq;
> > +
> > +	/** @reset: Reset related fields. */
> > +	struct {
> > +		/** @wq: Ordered worqueud used to schedule reset operations. */
> > +		struct workqueue_struct *wq;
> > +
> > +		/** @work: Reset work. */
> > +		struct work_struct work;
> > +
> > +		/** @pending: Set to true if a reset is pending. */
> > +		atomic_t pending;
> > +	} reset;
> > +
> > +	/** @pm: Power management related data. */
> > +	struct {
> > +		/** @state: Power state, see panthor_device_pm_state. */
> > +		atomic_t state;
> > +
> > +		/**
> > +		 * @lock: Lock protecting the suspend/resume operations.
> > +		 *
> > +		 * This is needed to ensure we map the dummy IO pages when
> > +		 * the device is being suspended, and the real IO pages when
> > +		 * the device is being resumed. We can't just do with the
> > +		 * state atomicity to deal with this race.
> > +		 */
> > +		struct mutex lock;
> > +
> > +		/**
> > +		 * @dummy_latest_flush: Dummy LATEST_FLUSH page.
> > +		 *
> > +		 * Used to replace the real LATEST_FLUSH page when the GPU
> > +		 * is suspended.
> > +		 */
> > +		u32 *dummy_latest_flush;
> > +	} pm;
> > +};
> > +
> > +/**
> > + * struct panthor_file - Panthor file
> > + */
> > +struct panthor_file {
> > +	/** @ptdev: Device attached to this file. */
> > +	struct panthor_device *ptdev;
> > +
> > +	/** @vms: VM pool attached to this file. */
> > +	struct panthor_vm_pool *vms;
> > +
> > +	/** @groups: Scheduling group pool attached to this file. */
> > +	struct panthor_group_pool *groups;
> > +};
> > +
> > +int panthor_device_init(struct panthor_device *ptdev);
> > +void panthor_device_unplug(struct panthor_device *ptdev);
> > +
> > +/**
> > + * panthor_device_schedule_reset() - Schedules a reset operation
> > + */
> > +static inline void panthor_device_schedule_reset(struct panthor_device *ptdev)
> > +{
> > +	if (atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE &&  
> 
> As above - this is a racy check. Although it might be safe because of
> the cancel_work_sync() call in panthor_device_suspend(). But if we get
> rid of this check we don't need the atomic variable.

As mentioned above, I don't think cancel_work_sync() solves the race,
because the check might be done just before suspend, the thread paused
and resumed after cancel_work_sync() was called in the suspend path.

> 
> > +	    !atomic_cmpxchg(&ptdev->reset.pending, 0, 1))
> > +		queue_work(ptdev->reset.wq, &ptdev->reset.work);
> > +}
> > +
> > +/**
> > + * panthor_device_reset_is_pending() - Checks if a reset is pending.
> > + *
> > + * Return: true if a reset is pending, false otherwise.
> > + */
> > +static inline bool panthor_device_reset_is_pending(struct panthor_device *ptdev)
> > +{
> > +	return atomic_read(&ptdev->reset.pending) != 0;
> > +}
> > +
> > +int panthor_device_mmap_io(struct panthor_device *ptdev,
> > +			   struct vm_area_struct *vma);
> > +
> > +int panthor_device_resume(struct device *dev);
> > +int panthor_device_suspend(struct device *dev);
> > +
> > +enum drm_panthor_exception_type {
> > +	DRM_PANTHOR_EXCEPTION_OK = 0x00,
> > +	DRM_PANTHOR_EXCEPTION_TERMINATED = 0x04,
> > +	DRM_PANTHOR_EXCEPTION_KABOOM = 0x05,
> > +	DRM_PANTHOR_EXCEPTION_EUREKA = 0x06,
> > +	DRM_PANTHOR_EXCEPTION_ACTIVE = 0x08,
> > +	DRM_PANTHOR_EXCEPTION_CS_RES_TERM = 0x0f,
> > +	DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT = 0x3f,
> > +	DRM_PANTHOR_EXCEPTION_CS_CONFIG_FAULT = 0x40,
> > +	DRM_PANTHOR_EXCEPTION_CS_ENDPOINT_FAULT = 0x44,
> > +	DRM_PANTHOR_EXCEPTION_CS_BUS_FAULT = 0x48,
> > +	DRM_PANTHOR_EXCEPTION_CS_INSTR_INVALID = 0x49,
> > +	DRM_PANTHOR_EXCEPTION_CS_CALL_STACK_OVERFLOW = 0x4a,
> > +	DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT = 0x4b,
> > +	DRM_PANTHOR_EXCEPTION_INSTR_INVALID_PC = 0x50,
> > +	DRM_PANTHOR_EXCEPTION_INSTR_INVALID_ENC = 0x51,
> > +	DRM_PANTHOR_EXCEPTION_INSTR_BARRIER_FAULT = 0x55,
> > +	DRM_PANTHOR_EXCEPTION_DATA_INVALID_FAULT = 0x58,
> > +	DRM_PANTHOR_EXCEPTION_TILE_RANGE_FAULT = 0x59,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_RANGE_FAULT = 0x5a,
> > +	DRM_PANTHOR_EXCEPTION_IMPRECISE_FAULT = 0x5b,
> > +	DRM_PANTHOR_EXCEPTION_OOM = 0x60,
> > +	DRM_PANTHOR_EXCEPTION_CSF_FW_INTERNAL_ERROR = 0x68,
> > +	DRM_PANTHOR_EXCEPTION_CSF_RES_EVICTION_TIMEOUT = 0x69,
> > +	DRM_PANTHOR_EXCEPTION_GPU_BUS_FAULT = 0x80,
> > +	DRM_PANTHOR_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88,
> > +	DRM_PANTHOR_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89,
> > +	DRM_PANTHOR_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a,
> > +	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0,
> > +	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1,
> > +	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2,
> > +	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3,
> > +	DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4,
> > +	DRM_PANTHOR_EXCEPTION_PERM_FAULT_0 = 0xc8,
> > +	DRM_PANTHOR_EXCEPTION_PERM_FAULT_1 = 0xc9,
> > +	DRM_PANTHOR_EXCEPTION_PERM_FAULT_2 = 0xca,
> > +	DRM_PANTHOR_EXCEPTION_PERM_FAULT_3 = 0xcb,
> > +	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_1 = 0xd9,
> > +	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_2 = 0xda,
> > +	DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_3 = 0xdb,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_IN = 0xe0,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6,
> > +	DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7,
> > +	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8,
> > +	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9,
> > +	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea,
> > +	DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb,
> > +};
> > +
> > +/**
> > + * panthor_exception_is_fault() - Checks if an exception is a fault.
> > + *
> > + * Return: true if the exception is a fault, false otherwise.
> > + */
> > +static inline bool
> > +panthor_exception_is_fault(u32 exception_code)
> > +{
> > +	return exception_code > DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT;
> > +}
> > +
> > +const char *panthor_exception_name(struct panthor_device *ptdev,
> > +				   u32 exception_code);
> > +
> > +/**
> > + * PANTHOR_IRQ_HANDLER() - Define interrupt handlers and the interrupt
> > + * registration function.
> > + *
> > + * The boiler-plate to gracefully deal with shared interrupts is
> > + * auto-generated. All you have to do is call PANTHOR_IRQ_HANDLER()
> > + * just after you actual handler. The handler prototype is:  
> s/you/your/ or probably s/you/the/ since we don't expect people to be
> adding more ;)
> 
> > + *
> > + * void (*handler)(struct panthor_device *, u32 status);
> > + */
> > +#define PANTHOR_IRQ_HANDLER(__name, __reg_prefix, __handler)					\
> > +static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data)			\
> > +{												\
> > +	struct panthor_irq *pirq = data;							\
> > +	struct panthor_device *ptdev = pirq->ptdev;						\  
> 
> Maybe I'm missing something, but I was expecting a check here for if the
> irq has been suspended and to avoid the register reads if it was.

Thought the INT_MASK=0 + synchronize_irq() in panthor_xxx_irq_suspend()
would guarantee that the handler can't be called after
panthor_xxx_irq_suspend() was called.

> Otherwise I'm not entirely sure I follow what all this code is for.

Not entirely sure which code we're talking about. The reason we
don't use the default raw IRQ handler is because it doesn't work if the
irq line is shared. In that case, we need to mask all interrupts to
make sure other handlers on the same irq line don't get spammed with
our IRQs.

> 
> Steve
> 
> > +												\
> > +	if (!gpu_read(ptdev, __reg_prefix ## _INT_STAT))					\
> > +		return IRQ_NONE;								\
> > +												\
> > +	gpu_write(ptdev, __reg_prefix ## _INT_MASK, 0);						\
> > +	return IRQ_WAKE_THREAD;									\
> > +}												\
> > +												\
> > +static irqreturn_t panthor_ ## __name ## _irq_threaded_handler(int irq, void *data)		\
> > +{												\
> > +	struct panthor_irq *pirq = data;							\
> > +	struct panthor_device *ptdev = pirq->ptdev;						\
> > +	irqreturn_t ret = IRQ_NONE;								\
> > +												\
> > +	while (true) {										\
> > +		u32 status = gpu_read(ptdev, __reg_prefix ## _INT_RAWSTAT) & pirq->mask;	\
> > +												\
> > +		if (!status)									\
> > +			break;									\
> > +												\
> > +		gpu_write(ptdev, __reg_prefix ## _INT_CLEAR, status);				\
> > +												\
> > +		__handler(ptdev, status);							\
> > +		ret = IRQ_HANDLED;								\
> > +	}											\
> > +												\
> > +	if (!atomic_read(&pirq->suspended))							\
> > +		gpu_write(ptdev, __reg_prefix ## _INT_MASK, pirq->mask);			\
> > +												\
> > +	return ret;										\
> > +}												\
> > +												\
> > +static inline void panthor_ ## __name ## _irq_suspend(struct panthor_irq *pirq)			\
> > +{												\
> > +	int cookie;										\
> > +												\
> > +	atomic_set(&pirq->suspended, true);							\
> > +												\
> > +	if (drm_dev_enter(&pirq->ptdev->base, &cookie)) {					\
> > +		gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, 0);				\
> > +		synchronize_irq(pirq->irq);							\
> > +		drm_dev_exit(cookie);								\
> > +	}											\
> > +												\
> > +	pirq->mask = 0;										\
> > +}												\
> > +												\
> > +static inline void panthor_ ## __name ## _irq_resume(struct panthor_irq *pirq, u32 mask)	\
> > +{												\
> > +	int cookie;										\
> > +												\
> > +	atomic_set(&pirq->suspended, false);							\
> > +	pirq->mask = mask;									\
> > +												\
> > +	if (drm_dev_enter(&pirq->ptdev->base, &cookie)) {					\
> > +		gpu_write(pirq->ptdev, __reg_prefix ## _INT_CLEAR, mask);			\
> > +		gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, mask);			\
> > +		drm_dev_exit(cookie);								\
> > +	}											\
> > +}												\
> > +												\
> > +static int panthor_request_ ## __name ## _irq(struct panthor_device *ptdev,			\
> > +					      struct panthor_irq *pirq,				\
> > +					      int irq, u32 mask)				\
> > +{												\
> > +	pirq->ptdev = ptdev;									\
> > +	pirq->irq = irq;									\
> > +	panthor_ ## __name ## _irq_resume(pirq, mask);						\
> > +												\
> > +	return devm_request_threaded_irq(ptdev->base.dev, irq,					\
> > +					 panthor_ ## __name ## _irq_raw_handler,		\
> > +					 panthor_ ## __name ## _irq_threaded_handler,		\
> > +					 IRQF_SHARED, KBUILD_MODNAME "-" # __name,		\
> > +					 pirq);							\
> > +}
> > +
> > +extern struct workqueue_struct *panthor_cleanup_wq;
> > +
> > +#endif  
>