Add a way to dump perf counters through debugfs. The implementation is kept simple and has a number of limitations: * it's not designed for multi-user usage as the counter values are reset after each dump and there's no per-user context * only accessible to root users * no counters naming/position abstraction. Things are dumped in a raw format that has to be parsed by the user who has to know where the relevant values are and what they mean This implementation is intended to be used by mesa developers to help debug perf-related issues while we work on a more generic approach that would allow all GPU drivers to expose their counters in a consistent way. As a result, this debugfs interface is considered unstable and might be deprecated in the future. Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> --- Changes in v2: * Expose counters through debugfs and keep things simple for now (we'll work on a generic solution in parallel) --- drivers/gpu/drm/panfrost/Makefile | 3 +- drivers/gpu/drm/panfrost/panfrost_device.c | 9 + drivers/gpu/drm/panfrost/panfrost_device.h | 3 + drivers/gpu/drm/panfrost/panfrost_drv.c | 7 + drivers/gpu/drm/panfrost/panfrost_gpu.c | 7 + drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 339 ++++++++++++++++++++ drivers/gpu/drm/panfrost/panfrost_perfcnt.h | 15 + drivers/gpu/drm/panfrost/panfrost_regs.h | 19 ++ 8 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile index 6de72d13c58f..ecf0864cb515 100644 --- a/drivers/gpu/drm/panfrost/Makefile +++ b/drivers/gpu/drm/panfrost/Makefile @@ -7,6 +7,7 @@ panfrost-y := \ panfrost_gem.o \ panfrost_gpu.o \ panfrost_job.o \ - panfrost_mmu.o + panfrost_mmu.o \ + panfrost_perfcnt.o obj-$(CONFIG_DRM_PANFROST) += panfrost.o diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c index 3b2bced1b015..b2395fc68a1f 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.c +++ b/drivers/gpu/drm/panfrost/panfrost_device.c @@ -14,6 +14,7 @@ #include "panfrost_gpu.h" #include "panfrost_job.h" #include "panfrost_mmu.h" +#include "panfrost_perfcnt.h" static int panfrost_reset_init(struct panfrost_device *pfdev) { @@ -149,7 +150,13 @@ int panfrost_device_init(struct panfrost_device *pfdev) pm_runtime_mark_last_busy(pfdev->dev); pm_runtime_put_autosuspend(pfdev->dev); + err = panfrost_perfcnt_init(pfdev); + if (err) + goto err_out5; + return 0; +err_out5: + panfrost_job_fini(pfdev); err_out4: panfrost_mmu_fini(pfdev); err_out3: @@ -165,6 +172,7 @@ int panfrost_device_init(struct panfrost_device *pfdev) void panfrost_device_fini(struct panfrost_device *pfdev) { + panfrost_perfcnt_fini(pfdev); panfrost_job_fini(pfdev); panfrost_mmu_fini(pfdev); panfrost_gpu_fini(pfdev); @@ -237,6 +245,7 @@ int panfrost_device_resume(struct device *dev) panfrost_mmu_enable(pfdev, 0); panfrost_job_enable_interrupts(pfdev); panfrost_devfreq_resume(pfdev); + panfrost_perfcnt_resume(pfdev); return 0; } diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index 56f452dfb490..628d704b76ba 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -14,6 +14,7 @@ struct panfrost_device; struct panfrost_mmu; struct panfrost_job_slot; struct panfrost_job; +struct panfrost_perfcnt; #define NUM_JOB_SLOTS 3 @@ -77,6 +78,8 @@ struct panfrost_device { struct panfrost_job *jobs[NUM_JOB_SLOTS]; struct list_head scheduled_jobs; + struct panfrost_perfcnt *perfcnt; + struct mutex sched_lock; struct mutex reset_lock; diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 94b0819ad50b..202397537eea 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -19,6 +19,7 @@ #include "panfrost_mmu.h" #include "panfrost_job.h" #include "panfrost_gpu.h" +#include "panfrost_perfcnt.h" static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file) { @@ -340,6 +341,11 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = { DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops); +static int panfrost_debugfs_init(struct drm_minor *minor) +{ + return panfrost_perfcnt_debugfs_init(minor); +} + static struct drm_driver panfrost_drm_driver = { .driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_PRIME | DRIVER_SYNCOBJ, @@ -359,6 +365,7 @@ static struct drm_driver panfrost_drm_driver = { .prime_fd_to_handle = drm_gem_prime_fd_to_handle, .gem_prime_import_sg_table = panfrost_gem_prime_import_sg_table, .gem_prime_mmap = drm_gem_prime_mmap, + .debugfs_init = panfrost_debugfs_init, }; static int panfrost_probe(struct platform_device *pdev) diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c index 6e68a100291c..20ab333fc925 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c @@ -15,6 +15,7 @@ #include "panfrost_features.h" #include "panfrost_issues.h" #include "panfrost_gpu.h" +#include "panfrost_perfcnt.h" #include "panfrost_regs.h" static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data) @@ -40,6 +41,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data) gpu_write(pfdev, GPU_INT_MASK, 0); } + if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED) + panfrost_perfcnt_sample_done(pfdev); + + if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED) + panfrost_perfcnt_clean_cache_done(pfdev); + gpu_write(pfdev, GPU_INT_CLEAR, state); return IRQ_HANDLED; diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c new file mode 100644 index 000000000000..8093783a5a26 --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2019 Collabora Ltd */ + +#include <drm/drm_file.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/panfrost_drm.h> +#include <linux/completion.h> +#include <linux/debugfs.h> +#include <linux/iopoll.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "panfrost_device.h" +#include "panfrost_features.h" +#include "panfrost_gem.h" +#include "panfrost_issues.h" +#include "panfrost_job.h" +#include "panfrost_mmu.h" +#include "panfrost_regs.h" + +#define COUNTERS_PER_BLOCK 64 +#define BYTES_PER_COUNTER 4 +#define BLOCKS_PER_COREGROUP 8 +#define V4_SHADERS_PER_COREGROUP 4 + +struct panfrost_perfcnt { + struct panfrost_gem_object *bo; + size_t bosize; + void *buf; + bool enabled; + struct mutex lock; + struct completion dump_comp; +}; + +void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev) +{ + complete(&pfdev->perfcnt->dump_comp); +} + +void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev) +{ + gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES); +} + +static void panfrost_perfcnt_setup(struct panfrost_device *pfdev) +{ + u32 cfg; + + /* + * Always use address space 0 for now. + * FIXME: this needs to be updated when we start using different + * address space. + */ + cfg = GPU_PERFCNT_CFG_AS(0); + if (panfrost_model_cmp(pfdev, 0x1000) >= 0) + cfg |= GPU_PERFCNT_CFG_SETSEL(1); + + gpu_write(pfdev, GPU_PERFCNT_CFG, + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF)); + + if (!pfdev->perfcnt->enabled) + return; + + gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff); + gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff); + gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff); + + /* + * Due to PRLAM-8186 we need to disable the Tiler before we enable HW + * counters. + */ + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186)) + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0); + else + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff); + + gpu_write(pfdev, GPU_PERFCNT_CFG, + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL)); + + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186)) + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff); +} + +static int panfrost_perfcnt_dump(struct panfrost_device *pfdev) +{ + u64 gpuva; + int ret; + + reinit_completion(&pfdev->perfcnt->dump_comp); + gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT; + gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva); + gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32); + gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE); + ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp, + msecs_to_jiffies(1000)); + if (!ret) + ret = -ETIMEDOUT; + else if (ret > 0) + ret = 0; + + return ret; +} + +void panfrost_perfcnt_resume(struct panfrost_device *pfdev) +{ + if (pfdev->perfcnt) + panfrost_perfcnt_setup(pfdev); +} + +static ssize_t panfrost_perfcnt_enable_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct panfrost_device *pfdev = file->private_data; + ssize_t ret; + + mutex_lock(&pfdev->perfcnt->lock); + ret = simple_read_from_buffer(user_buf, count, ppos, + pfdev->perfcnt->enabled ? "Y\n" : "N\n", + 2); + mutex_unlock(&pfdev->perfcnt->lock); + + return ret; +} + +static ssize_t panfrost_perfcnt_enable_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct panfrost_device *pfdev = file->private_data; + bool enable; + int ret; + + ret = kstrtobool_from_user(user_buf, count, &enable); + if (ret) + return ret; + + ret = pm_runtime_get_sync(pfdev->dev); + if (ret < 0) + return ret; + + mutex_lock(&pfdev->perfcnt->lock); + if (enable != pfdev->perfcnt->enabled) { + pfdev->perfcnt->enabled = enable; + panfrost_perfcnt_setup(pfdev); + } + mutex_unlock(&pfdev->perfcnt->lock); + + pm_runtime_mark_last_busy(pfdev->dev); + pm_runtime_put_autosuspend(pfdev->dev); + + return count; +} + +static const struct file_operations panfrost_perfcnt_enable_fops = { + .read = panfrost_perfcnt_enable_read, + .write = panfrost_perfcnt_enable_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static ssize_t panfrost_perfcnt_dump_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct panfrost_device *pfdev = file->private_data; + ssize_t ret; + + ret = pm_runtime_get_sync(pfdev->dev); + if (ret < 0) + return ret; + + mutex_lock(&pfdev->perfcnt->lock); + if (!pfdev->perfcnt->enabled) { + ret = -EINVAL; + goto out; + } + + ret = panfrost_perfcnt_dump(pfdev); + if (ret) + goto out; + + ret = simple_read_from_buffer(user_buf, count, ppos, + pfdev->perfcnt->buf, + pfdev->perfcnt->bosize); + +out: + mutex_unlock(&pfdev->perfcnt->lock); + pm_runtime_mark_last_busy(pfdev->dev); + pm_runtime_put_autosuspend(pfdev->dev); + + return ret; +} + +static const struct file_operations panfrost_perfcnt_dump_fops = { + .read = panfrost_perfcnt_dump_read, + .open = simple_open, + .llseek = default_llseek, +}; + +int panfrost_perfcnt_debugfs_init(struct drm_minor *minor) +{ + struct panfrost_device *pfdev = to_panfrost_device(minor->dev); + struct dentry *file, *dir; + + dir = debugfs_create_dir("perfcnt", minor->debugfs_root); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + file = debugfs_create_file("dump", 0400, dir, pfdev, + &panfrost_perfcnt_dump_fops); + if (IS_ERR(file)) + return PTR_ERR(file); + + file = debugfs_create_file("enable", 0600, dir, pfdev, + &panfrost_perfcnt_enable_fops); + if (IS_ERR(file)) + return PTR_ERR(file); + + return 0; +} + +int panfrost_perfcnt_init(struct panfrost_device *pfdev) +{ + struct panfrost_perfcnt *perfcnt; + struct drm_gem_shmem_object *bo; + size_t size; + u32 status; + int ret; + + if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) { + unsigned int ncoregroups; + + ncoregroups = hweight64(pfdev->features.l2_present); + size = ncoregroups * BLOCKS_PER_COREGROUP * + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER; + } else { + unsigned int nl2c, ncores; + + /* + * TODO: define a macro to extract the number of l2 caches from + * mem_features. + */ + nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1; + + /* + * The ARM driver is grouping cores per core group and then + * only using the number of cores in group 0 to calculate the + * size. Not sure why this is done like that, but I guess + * shader_present will only show cores in the first group + * anyway. + */ + ncores = hweight64(pfdev->features.shader_present); + + /* + * There's always one JM and one Tiler block, hence the '+ 2' + * here. + */ + size = (nl2c + ncores + 2) * + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER; + } + + perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL); + if (!perfcnt) + return -ENOMEM; + + bo = drm_gem_shmem_create(pfdev->ddev, size); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + perfcnt->bo = to_panfrost_bo(&bo->base); + perfcnt->bosize = size; + + /* + * We always use the same buffer, so let's map it once and keep it + * mapped until the driver is unloaded. This might be a problem if + * we start using different AS and the perfcnt BO is not mapped at + * the same GPU virtual address. + */ + ret = panfrost_mmu_map(perfcnt->bo); + if (ret) + goto err_put_bo; + + /* Disable everything. */ + gpu_write(pfdev, GPU_PERFCNT_CFG, + GPU_PERFCNT_CFG_AS(0) | + GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) | + (panfrost_model_cmp(pfdev, 0x1000) >= 0 ? + GPU_PERFCNT_CFG_SETSEL(1) : 0)); + gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0); + gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0); + gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0); + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0); + + perfcnt->buf = drm_gem_vmap(&bo->base); + if (IS_ERR(perfcnt->buf)) { + ret = PTR_ERR(perfcnt->buf); + goto err_put_bo; + } + + init_completion(&perfcnt->dump_comp); + mutex_init(&perfcnt->lock); + pfdev->perfcnt = perfcnt; + + /* + * Invalidate the cache and clear the counters to start from a fresh + * state. + */ + gpu_write(pfdev, GPU_INT_MASK, 0); + gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED); + + gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR); + gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES); + ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT, + status, + status & + GPU_IRQ_CLEAN_CACHES_COMPLETED, + 100, 10000); + if (ret) + goto err_gem_vunmap; + + gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL); + + return 0; + +err_gem_vunmap: + drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf); + +err_put_bo: + drm_gem_object_put_unlocked(&bo->base); + return ret; +} + +void panfrost_perfcnt_fini(struct panfrost_device *pfdev) +{ + drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf); + drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base); +} diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h new file mode 100644 index 000000000000..b21745398178 --- /dev/null +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2019 Collabora Ltd */ +#ifndef __PANFROST_PERFCNT_H__ +#define __PANFROST_PERFCNT_H__ + +#include "panfrost_device.h" + +void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev); +void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev); +int panfrost_perfcnt_init(struct panfrost_device *pfdev); +void panfrost_perfcnt_fini(struct panfrost_device *pfdev); +void panfrost_perfcnt_resume(struct panfrost_device *pfdev); +int panfrost_perfcnt_debugfs_init(struct drm_minor *minor); + +#endif diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h index 42d08860fd76..ea38ac60581c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_regs.h +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h @@ -44,12 +44,31 @@ GPU_IRQ_MULTIPLE_FAULT) #define GPU_CMD 0x30 #define GPU_CMD_SOFT_RESET 0x01 +#define GPU_CMD_PERFCNT_CLEAR 0x03 +#define GPU_CMD_PERFCNT_SAMPLE 0x04 +#define GPU_CMD_CLEAN_CACHES 0x07 +#define GPU_CMD_CLEAN_INV_CACHES 0x08 #define GPU_STATUS 0x34 +#define GPU_STATUS_PRFCNT_ACTIVE BIT(2) #define GPU_LATEST_FLUSH_ID 0x38 #define GPU_FAULT_STATUS 0x3C #define GPU_FAULT_ADDRESS_LO 0x40 #define GPU_FAULT_ADDRESS_HI 0x44 +#define GPU_PERFCNT_BASE_LO 0x60 +#define GPU_PERFCNT_BASE_HI 0x64 +#define GPU_PERFCNT_CFG 0x68 +#define GPU_PERFCNT_CFG_MODE(x) (x) +#define GPU_PERFCNT_CFG_MODE_OFF 0 +#define GPU_PERFCNT_CFG_MODE_MANUAL 1 +#define GPU_PERFCNT_CFG_MODE_TILE 2 +#define GPU_PERFCNT_CFG_AS(x) ((x) << 4) +#define GPU_PERFCNT_CFG_SETSEL(x) ((x) << 8) +#define GPU_PRFCNT_JM_EN 0x6c +#define GPU_PRFCNT_SHADER_EN 0x70 +#define GPU_PRFCNT_TILER_EN 0x74 +#define GPU_PRFCNT_MMU_L2_EN 0x7c + #define GPU_THREAD_MAX_THREADS 0x0A0 /* (RO) Maximum number of threads per core */ #define GPU_THREAD_MAX_WORKGROUP_SIZE 0x0A4 /* (RO) Maximum workgroup size */ #define GPU_THREAD_MAX_BARRIER_SIZE 0x0A8 /* (RO) Maximum threads waiting at a barrier */ -- 2.20.1 _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel