On 14/05/2019 14:31, Rob Herring wrote: > On Tue, May 14, 2019 at 5:48 AM Boris Brezillon > <boris.brezillon@xxxxxxxxxxxxx> wrote: >> >> Add a way to dump perf counters through debugfs. The implementation is >> kept simple and has a number of limitations: >> >> * it's not designed for multi-user usage as the counter values are >> reset after each dump and there's no per-user context >> * only accessible to root users >> * no counters naming/position abstraction. Things are dumped in a raw >> format that has to be parsed by the user who has to know where the >> relevant values are and what they mean >> >> This implementation is intended to be used by mesa developers to help >> debug perf-related issues while we work on a more generic approach that >> would allow all GPU drivers to expose their counters in a consistent >> way. As a result, this debugfs interface is considered unstable and >> might be deprecated in the future. >> >> Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxx> >> --- >> Changes in v2: >> * Expose counters through debugfs and keep things simple for now (we'll >> work on a generic solution in parallel) >> --- >> drivers/gpu/drm/panfrost/Makefile | 3 +- >> drivers/gpu/drm/panfrost/panfrost_device.c | 9 + >> drivers/gpu/drm/panfrost/panfrost_device.h | 3 + >> drivers/gpu/drm/panfrost/panfrost_drv.c | 7 + >> drivers/gpu/drm/panfrost/panfrost_gpu.c | 7 + >> drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 339 ++++++++++++++++++++ >> drivers/gpu/drm/panfrost/panfrost_perfcnt.h | 15 + >> drivers/gpu/drm/panfrost/panfrost_regs.h | 19 ++ >> 8 files changed, 401 insertions(+), 1 deletion(-) >> create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c >> create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h >> [...] >> new file mode 100644 >> index 000000000000..8093783a5a26 >> --- /dev/null >> +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c >> @@ -0,0 +1,339 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* Copyright 2019 Collabora Ltd */ >> + >> +#include <drm/drm_file.h> >> +#include <drm/drm_gem_shmem_helper.h> >> +#include <drm/panfrost_drm.h> >> +#include <linux/completion.h> >> +#include <linux/debugfs.h> >> +#include <linux/iopoll.h> >> +#include <linux/pm_runtime.h> >> +#include <linux/slab.h> >> +#include <linux/uaccess.h> >> + >> +#include "panfrost_device.h" >> +#include "panfrost_features.h" >> +#include "panfrost_gem.h" >> +#include "panfrost_issues.h" >> +#include "panfrost_job.h" >> +#include "panfrost_mmu.h" >> +#include "panfrost_regs.h" >> + >> +#define COUNTERS_PER_BLOCK 64 >> +#define BYTES_PER_COUNTER 4 >> +#define BLOCKS_PER_COREGROUP 8 >> +#define V4_SHADERS_PER_COREGROUP 4 >> + >> +struct panfrost_perfcnt { >> + struct panfrost_gem_object *bo; >> + size_t bosize; >> + void *buf; >> + bool enabled; >> + struct mutex lock; >> + struct completion dump_comp; >> +}; >> + >> +void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev) >> +{ >> + complete(&pfdev->perfcnt->dump_comp); >> +} >> + >> +void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev) >> +{ >> + gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES); >> +} >> + >> +static void panfrost_perfcnt_setup(struct panfrost_device *pfdev) >> +{ >> + u32 cfg; >> + >> + /* >> + * Always use address space 0 for now. >> + * FIXME: this needs to be updated when we start using different >> + * address space. >> + */ >> + cfg = GPU_PERFCNT_CFG_AS(0); >> + if (panfrost_model_cmp(pfdev, 0x1000) >= 0) > > You've got a couple of these. Perhaps we should add either a > model_is_bifrost() helper or an is_bifrost variable to use. > >> + cfg |= GPU_PERFCNT_CFG_SETSEL(1); mali_kbase only sets this if CONFIG_MALI_PRFCNT_SET_SECONDARY is defined - i.e. this selects a different selection of counters. At at the very least this should be an optional flag that can be set, but I suspect the primary set are the ones you are interested in. >> + >> + gpu_write(pfdev, GPU_PERFCNT_CFG, >> + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF)); >> + >> + if (!pfdev->perfcnt->enabled) >> + return; >> + >> + gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff); >> + gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff); >> + gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff); >> + >> + /* >> + * Due to PRLAM-8186 we need to disable the Tiler before we enable HW >> + * counters. > > Seems like you could just always apply the work-around? It doesn't > look like it would be much different. Technically the workaround causes the driver to perform the operation in the wrong order below (write TILER_EN when the mode is MANUAL) - this is a workaround for the hardware issue (and therefore works on that hardware). But I wouldn't like to say it works on other hardware which doesn't have the issue. >> + */ >> + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186)) >> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0); >> + else >> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff); >> + >> + gpu_write(pfdev, GPU_PERFCNT_CFG, >> + cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL)); >> + >> + if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186)) >> + gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff); >> +} >> + >> +static int panfrost_perfcnt_dump(struct panfrost_device *pfdev) >> +{ >> + u64 gpuva; >> + int ret; >> + >> + reinit_completion(&pfdev->perfcnt->dump_comp); >> + gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT; >> + gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva); >> + gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32); >> + gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE); >> + ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp, >> + msecs_to_jiffies(1000)); >> + if (!ret) >> + ret = -ETIMEDOUT; >> + else if (ret > 0) >> + ret = 0; >> + >> + return ret; >> +} >> + >> +void panfrost_perfcnt_resume(struct panfrost_device *pfdev) > > No suspend handling needed? > >> +{ >> + if (pfdev->perfcnt) >> + panfrost_perfcnt_setup(pfdev); >> +} >> + >> +static ssize_t panfrost_perfcnt_enable_read(struct file *file, >> + char __user *user_buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct panfrost_device *pfdev = file->private_data; >> + ssize_t ret; >> + >> + mutex_lock(&pfdev->perfcnt->lock); >> + ret = simple_read_from_buffer(user_buf, count, ppos, >> + pfdev->perfcnt->enabled ? "Y\n" : "N\n", >> + 2); >> + mutex_unlock(&pfdev->perfcnt->lock); >> + >> + return ret; >> +} >> + >> +static ssize_t panfrost_perfcnt_enable_write(struct file *file, >> + const char __user *user_buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct panfrost_device *pfdev = file->private_data; >> + bool enable; >> + int ret; >> + >> + ret = kstrtobool_from_user(user_buf, count, &enable); >> + if (ret) >> + return ret; >> + >> + ret = pm_runtime_get_sync(pfdev->dev); >> + if (ret < 0) >> + return ret; >> + >> + mutex_lock(&pfdev->perfcnt->lock); >> + if (enable != pfdev->perfcnt->enabled) { >> + pfdev->perfcnt->enabled = enable; >> + panfrost_perfcnt_setup(pfdev); >> + } >> + mutex_unlock(&pfdev->perfcnt->lock); >> + >> + pm_runtime_mark_last_busy(pfdev->dev); >> + pm_runtime_put_autosuspend(pfdev->dev); >> + >> + return count; >> +} >> + >> +static const struct file_operations panfrost_perfcnt_enable_fops = { >> + .read = panfrost_perfcnt_enable_read, >> + .write = panfrost_perfcnt_enable_write, >> + .open = simple_open, >> + .llseek = default_llseek, >> +}; >> + >> +static ssize_t panfrost_perfcnt_dump_read(struct file *file, >> + char __user *user_buf, >> + size_t count, loff_t *ppos) >> +{ >> + struct panfrost_device *pfdev = file->private_data; >> + ssize_t ret; >> + >> + ret = pm_runtime_get_sync(pfdev->dev); >> + if (ret < 0) >> + return ret; >> + >> + mutex_lock(&pfdev->perfcnt->lock); >> + if (!pfdev->perfcnt->enabled) { >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + ret = panfrost_perfcnt_dump(pfdev); >> + if (ret) >> + goto out; >> + >> + ret = simple_read_from_buffer(user_buf, count, ppos, >> + pfdev->perfcnt->buf, >> + pfdev->perfcnt->bosize); >> + >> +out: >> + mutex_unlock(&pfdev->perfcnt->lock); >> + pm_runtime_mark_last_busy(pfdev->dev); >> + pm_runtime_put_autosuspend(pfdev->dev); >> + >> + return ret; >> +} >> + >> +static const struct file_operations panfrost_perfcnt_dump_fops = { >> + .read = panfrost_perfcnt_dump_read, >> + .open = simple_open, >> + .llseek = default_llseek, >> +}; >> + >> +int panfrost_perfcnt_debugfs_init(struct drm_minor *minor) >> +{ >> + struct panfrost_device *pfdev = to_panfrost_device(minor->dev); >> + struct dentry *file, *dir; >> + >> + dir = debugfs_create_dir("perfcnt", minor->debugfs_root); >> + if (IS_ERR(dir)) >> + return PTR_ERR(dir); >> + >> + file = debugfs_create_file("dump", 0400, dir, pfdev, >> + &panfrost_perfcnt_dump_fops); >> + if (IS_ERR(file)) >> + return PTR_ERR(file); >> + >> + file = debugfs_create_file("enable", 0600, dir, pfdev, >> + &panfrost_perfcnt_enable_fops); >> + if (IS_ERR(file)) >> + return PTR_ERR(file); >> + >> + return 0; >> +} >> + >> +int panfrost_perfcnt_init(struct panfrost_device *pfdev) >> +{ >> + struct panfrost_perfcnt *perfcnt; >> + struct drm_gem_shmem_object *bo; >> + size_t size; >> + u32 status; >> + int ret; >> + >> + if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) { >> + unsigned int ncoregroups; >> + >> + ncoregroups = hweight64(pfdev->features.l2_present); >> + size = ncoregroups * BLOCKS_PER_COREGROUP * >> + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER; >> + } else { >> + unsigned int nl2c, ncores; >> + >> + /* >> + * TODO: define a macro to extract the number of l2 caches from >> + * mem_features. >> + */ >> + nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1; >> + >> + /* >> + * The ARM driver is grouping cores per core group and then >> + * only using the number of cores in group 0 to calculate the >> + * size. Not sure why this is done like that, but I guess >> + * shader_present will only show cores in the first group >> + * anyway. >> + */ I'm not sure I understand this comment. I'm not sure which version of the driver you are looking at, but shader_present should contain all the cores (in every core group). The kbase driver (in panfrost's git tree[1]), contains: base_gpu_props *props = &kbdev->gpu_props.props; u32 nr_l2 = props->l2_props.num_l2_slices; u64 core_mask = props->coherency_info.group[0].core_mask; u32 nr_blocks = fls64(core_mask); /* JM and tiler counter blocks are always present */ dump_size = (2 + nr_l2 + nr_blocks) * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT; [1] https://gitlab.freedesktop.org/panfrost/mali_kbase/blob/master/driver/product/kernel/drivers/gpu/arm/midgard/mali_kbase_vinstr.c i.e. the number of cores + number of L2 caches (plus 2 for JM/tiler) multiplied by the block size. Also another difference with the below is that fls64 and hweight64 don't return the same numbers. If the core mask is non-contiguous this will return different values. >> + ncores = hweight64(pfdev->features.shader_present); >> + >> + /* >> + * There's always one JM and one Tiler block, hence the '+ 2' >> + * here. >> + */ >> + size = (nl2c + ncores + 2) * >> + COUNTERS_PER_BLOCK * BYTES_PER_COUNTER; >> + } >> + >> + perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL); >> + if (!perfcnt) >> + return -ENOMEM; [...] Steve _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel