On 06/08, Juan A. Suarez Romero wrote:
> The V3D engine has several hardware performance counters that can be of
> interest for userspace performance analysis tools.
>
> This exposes new ioctls to create and destroy performance monitor
> objects, as well as to query the counter values.
>
> Each created performance monitor object has an ID that can be attached
> to CL/CSD submissions, so the driver enables the requested counters when
> the job is submitted, and updates the performance monitor values when
> the job is done.
>
> It is up to the user to ensure all the jobs have been finished before
> getting the performance monitor values. It is also up to the user to
> properly synchronize BCL jobs when submitting jobs with different
> performance monitors attached.
>
> Cc: Daniel Vetter <daniel@xxxxxxxx>
> Cc: David Airlie <airlied@xxxxxxxx>
> Cc: Emma Anholt <emma@xxxxxxxxxx>
> To: dri-devel@xxxxxxxxxxxxxxxxxxxxx
> Signed-off-by: Juan A. Suarez Romero <jasuarez@xxxxxxxxxx>

Hi Juan,

I've checked it (together with the Mesa MR) on glxgears, plus some basic
default tests from IGT, and it looks good to me.

Acked-by: Melissa Wen <mwen@xxxxxxxxxx>

> ---
>  drivers/gpu/drm/v3d/Makefile      |   1 +
>  drivers/gpu/drm/v3d/v3d_drv.c     |   8 ++
>  drivers/gpu/drm/v3d/v3d_drv.h     |  63 +++++++++
>  drivers/gpu/drm/v3d/v3d_gem.c     |  31 +++++
>  drivers/gpu/drm/v3d/v3d_perfmon.c | 213 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/v3d/v3d_regs.h    |   2 +
>  drivers/gpu/drm/v3d/v3d_sched.c   |  16 +++
>  include/uapi/drm/v3d_drm.h        | 136 +++++++++++++++++++
>  8 files changed, 470 insertions(+)
>  create mode 100644 drivers/gpu/drm/v3d/v3d_perfmon.c
>
> diff --git a/drivers/gpu/drm/v3d/Makefile b/drivers/gpu/drm/v3d/Makefile
> index db4cfc155821..e8b314137020 100644
> --- a/drivers/gpu/drm/v3d/Makefile
> +++ b/drivers/gpu/drm/v3d/Makefile
> @@ -9,6 +9,7 @@ v3d-y := \
>  	v3d_gem.o \
>  	v3d_irq.o \
>  	v3d_mmu.o \
> +	v3d_perfmon.o \
>  	v3d_trace_points.o \
>  	v3d_sched.o
>
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
> index 99e22beea90b..9403c3b36aca 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.c
> +++ b/drivers/gpu/drm/v3d/v3d_drv.c
> @@ -94,6 +94,9 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
>  	case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
>  		args->value = 1;
>  		return 0;
> +	case DRM_V3D_PARAM_SUPPORTS_PERFMON:
> +		args->value = (v3d->ver >= 40);
> +		return 0;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", args->param);
>  		return -EINVAL;
> @@ -121,6 +124,7 @@ v3d_open(struct drm_device *dev, struct drm_file *file)
>  				      1, NULL);
>  	}
>
> +	v3d_perfmon_open_file(v3d_priv);
>  	file->driver_priv = v3d_priv;
>
>  	return 0;
> @@ -136,6 +140,7 @@ v3d_postclose(struct drm_device *dev, struct drm_file *file)
>  		drm_sched_entity_destroy(&v3d_priv->sched_entity[q]);
>  	}
>
> +	v3d_perfmon_close_file(v3d_priv);
>  	kfree(v3d_priv);
>  }
>
> @@ -156,6 +161,9 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
>  	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_CREATE, v3d_perfmon_create_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_DESTROY, v3d_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_GET_VALUES, v3d_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
>  };
>
>  static const struct drm_driver v3d_drm_driver = {
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index 8a390738d65b..270134779073 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -37,6 +37,40 @@ struct v3d_queue_state {
>  	u64 emit_seqno;
>  };
>
> +/* Performance monitor object. The perform lifetime is controlled by userspace
> + * using perfmon related ioctls. A perfmon can be attached to a submit_cl
> + * request, and when this is the case, HW perf counters will be activated just
> + * before the submit_cl is submitted to the GPU and disabled when the job is
> + * done. This way, only events related to a specific job will be counted.
> + */
> +struct v3d_perfmon {
> +	/* Tracks the number of users of the perfmon, when this counter reaches
> +	 * zero the perfmon is destroyed.
> +	 */
> +	refcount_t refcnt;
> +
> +	/* Protects perfmon stop, as it can be invoked from multiple places. */
> +	struct mutex lock;
> +
> +	/* Number of counters activated in this perfmon instance
> +	 * (should be less than DRM_V3D_MAX_PERF_COUNTERS).
> +	 */
> +	u8 ncounters;
> +
> +	/* Events counted by the HW perf counters. */
> +	u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
> +
> +	/* Storage for counter values. Counters are incremented by the
> +	 * HW perf counter values every time the perfmon is attached
> +	 * to a GPU job. This way, perfmon users don't have to
> +	 * retrieve the results after each job if they want to track
> +	 * events covering several submissions. Note that counter
> +	 * values can't be reset, but you can fake a reset by
> +	 * destroying the perfmon and creating a new one.
> +	 */
> +	u64 values[];
> +};
> +
>  struct v3d_dev {
>  	struct drm_device drm;
>
> @@ -89,6 +123,9 @@ struct v3d_dev {
>  	 */
>  	spinlock_t job_lock;
>
> +	/* Used to track the active perfmon if any. */
> +	struct v3d_perfmon *active_perfmon;
> +
>  	/* Protects bo_stats */
>  	struct mutex bo_lock;
>
> @@ -133,6 +170,11 @@ v3d_has_csd(struct v3d_dev *v3d)
>  struct v3d_file_priv {
>  	struct v3d_dev *v3d;
>
> +	struct {
> +		struct idr idr;
> +		struct mutex lock;
> +	} perfmon;
> +
>  	struct drm_sched_entity sched_entity[V3D_MAX_QUEUES];
>  };
>
> @@ -205,6 +247,11 @@ struct v3d_job {
>  	 */
>  	struct dma_fence *done_fence;
>
> +	/* Pointer to a performance monitor object if the user requested it,
> +	 * NULL otherwise.
> +	 */
> +	struct v3d_perfmon *perfmon;
> +
>  	/* Callback for the freeing of the job on refcount going to 0. */
>  	void (*free)(struct kref *ref);
>  };
> @@ -353,3 +400,19 @@ void v3d_mmu_remove_ptes(struct v3d_bo *bo);
>  /* v3d_sched.c */
>  int v3d_sched_init(struct v3d_dev *v3d);
>  void v3d_sched_fini(struct v3d_dev *v3d);
> +
> +/* v3d_perfmon.c */
> +void v3d_perfmon_get(struct v3d_perfmon *perfmon);
> +void v3d_perfmon_put(struct v3d_perfmon *perfmon);
> +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon);
> +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
> +		      bool capture);
> +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id);
> +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv);
> +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv);
> +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv);
> +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv);
> +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
> index 4eb354226972..5689da118197 100644
> --- a/drivers/gpu/drm/v3d/v3d_gem.c
> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
> @@ -126,6 +126,8 @@ v3d_reset(struct v3d_dev *v3d)
>  	v3d_mmu_set_page_table(v3d);
>  	v3d_irq_reset(v3d);
>
> +	v3d_perfmon_stop(v3d, v3d->active_perfmon, false);
> +
>  	trace_v3d_reset_end(dev);
>  }
>
> @@ -375,6 +377,9 @@ v3d_job_free(struct kref *ref)
>  	pm_runtime_mark_last_busy(job->v3d->drm.dev);
>  	pm_runtime_put_autosuspend(job->v3d->drm.dev);
>
> +	if (job->perfmon)
> +		v3d_perfmon_put(job->perfmon);
> +
>  	kfree(job);
>  }
>
> @@ -539,6 +544,9 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>
>  	trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end);
>
> +	if (args->pad != 0)
> +		return -EINVAL;
> +
>  	if (args->flags != 0 &&
>  	    args->flags != DRM_V3D_SUBMIT_CL_FLUSH_CACHE) {
>  		DRM_INFO("invalid flags: %d\n", args->flags);
> @@ -611,8 +619,20 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto fail;
>
> +	if (args->perfmon_id) {
> +		render->base.perfmon = v3d_perfmon_find(v3d_priv,
> +							args->perfmon_id);
> +
> +		if (!render->base.perfmon) {
> +			ret = -ENOENT;
> +			goto fail;
> +		}
> +	}
> +
>  	mutex_lock(&v3d->sched_lock);
>  	if (bin) {
> +		bin->base.perfmon = render->base.perfmon;
> +		v3d_perfmon_get(bin->base.perfmon);
>  		ret = v3d_push_job(v3d_priv, &bin->base, V3D_BIN);
>  		if (ret)
>  			goto fail_unreserve;
> @@ -633,6 +653,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>  		ret = drm_gem_fence_array_add(&clean_job->deps, render_fence);
>  		if (ret)
>  			goto fail_unreserve;
> +		clean_job->perfmon = render->base.perfmon;
> +		v3d_perfmon_get(clean_job->perfmon);
>  		ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
>  		if (ret)
>  			goto fail_unreserve;
> @@ -827,6 +849,15 @@ v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto fail;
>
> +	if (args->perfmon_id) {
> +		job->base.perfmon = v3d_perfmon_find(v3d_priv,
> +						     args->perfmon_id);
> +		if (!job->base.perfmon) {
> +			ret = -ENOENT;
> +			goto fail;
> +		}
> +	}
> +
>  	mutex_lock(&v3d->sched_lock);
>  	ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
>  	if (ret)
> diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c
> new file mode 100644
> index 000000000000..0288ef063513
> --- /dev/null
> +++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2021 Raspberry Pi
> + */
> +
> +#include "v3d_drv.h"
> +#include "v3d_regs.h"
> +
> +#define V3D_PERFMONID_MIN	1
> +#define V3D_PERFMONID_MAX	U32_MAX
> +
> +void v3d_perfmon_get(struct v3d_perfmon *perfmon)
> +{
> +	if (perfmon)
> +		refcount_inc(&perfmon->refcnt);
> +}
> +
> +void v3d_perfmon_put(struct v3d_perfmon *perfmon)
> +{
> +	if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
> +		kfree(perfmon);
> +}
> +
> +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
> +{
> +	unsigned int i;
> +	u32 mask;
> +	u8 ncounters = perfmon->ncounters;
> +
> +	if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon))
> +		return;
> +
> +	mask = GENMASK(ncounters - 1, 0);
> +
> +	for (i = 0; i < ncounters; i++) {
> +		u32 source = i / 4;
> +		u32 channel = V3D_SET_FIELD(perfmon->counters[i], V3D_PCTR_S0);
> +
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S1);
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S2);
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S3);
> +		V3D_CORE_WRITE(0, V3D_V4_PCTR_0_SRC_X(source), channel);
> +	}
> +
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
> +	V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask);
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask);
> +
> +	v3d->active_perfmon = perfmon;
> +}
> +
> +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
> +		      bool capture)
> +{
> +	unsigned int i;
> +
> +	if (!perfmon || !v3d->active_perfmon)
> +		return;
> +
> +	mutex_lock(&perfmon->lock);
> +	if (perfmon != v3d->active_perfmon) {
> +		mutex_unlock(&perfmon->lock);
> +		return;
> +	}
> +
> +	if (capture)
> +		for (i = 0; i < perfmon->ncounters; i++)
> +			perfmon->values[i] += V3D_CORE_READ(0, V3D_PCTR_0_PCTRX(i));
> +
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
> +
> +	v3d->active_perfmon = NULL;
> +	mutex_unlock(&perfmon->lock);
> +}
> +
> +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id)
> +{
> +	struct v3d_perfmon *perfmon;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_find(&v3d_priv->perfmon.idr, id);
> +	v3d_perfmon_get(perfmon);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	return perfmon;
> +}
> +
> +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv)
> +{
> +	mutex_init(&v3d_priv->perfmon.lock);
> +	idr_init(&v3d_priv->perfmon.idr);
> +}
> +
> +static int v3d_perfmon_idr_del(int id, void *elem, void *data)
> +{
> +	struct v3d_perfmon *perfmon = elem;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return 0;
> +}
> +
> +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv)
> +{
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	idr_for_each(&v3d_priv->perfmon.idr, v3d_perfmon_idr_del, NULL);
> +	idr_destroy(&v3d_priv->perfmon.idr);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +}
> +
> +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv)
> +{
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_create *req = data;
> +	struct v3d_perfmon *perfmon;
> +	unsigned int i;
> +	int ret;
> +
> +	/* Number of monitored counters cannot exceed HW limits. */
> +	if (req->ncounters > DRM_V3D_MAX_PERF_COUNTERS ||
> +	    !req->ncounters)
> +		return -EINVAL;
> +
> +	/* Make sure all counters are valid. */
> +	for (i = 0; i < req->ncounters; i++) {
> +		if (req->counters[i] >= V3D_PERFCNT_NUM)
> +			return -EINVAL;
> +	}
> +
> +	perfmon = kzalloc(struct_size(perfmon, values, req->ncounters),
> +			  GFP_KERNEL);
> +	if (!perfmon)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < req->ncounters; i++)
> +		perfmon->counters[i] = req->counters[i];
> +
> +	perfmon->ncounters = req->ncounters;
> +
> +	refcount_set(&perfmon->refcnt, 1);
> +	mutex_init(&perfmon->lock);
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	ret = idr_alloc(&v3d_priv->perfmon.idr, perfmon, V3D_PERFMONID_MIN,
> +			V3D_PERFMONID_MAX, GFP_KERNEL);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (ret < 0) {
> +		kfree(perfmon);
> +		return ret;
> +	}
> +
> +	req->id = ret;
> +
> +	return 0;
> +}
> +
> +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv)
> +{
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_destroy *req = data;
> +	struct v3d_perfmon *perfmon;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_remove(&v3d_priv->perfmon.idr, req->id);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return 0;
> +}
> +
> +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv)
> +{
> +	struct v3d_dev *v3d = to_v3d_dev(dev);
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_get_values *req = data;
> +	struct v3d_perfmon *perfmon;
> +	int ret = 0;
> +
> +	if (req->pad != 0)
> +		return -EINVAL;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_find(&v3d_priv->perfmon.idr, req->id);
> +	v3d_perfmon_get(perfmon);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	v3d_perfmon_stop(v3d, perfmon, true);
> +
> +	if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values,
> +			 perfmon->ncounters * sizeof(u64)))
> +		ret = -EFAULT;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index 9bcb57781d31..3663e0d6bf76 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -347,6 +347,8 @@
>  /* Each src reg muxes four counters each. */
>  #define V3D_V4_PCTR_0_SRC_0_3		0x00660
>  #define V3D_V4_PCTR_0_SRC_28_31		0x0067c
> +#define V3D_V4_PCTR_0_SRC_X(x)		(V3D_V4_PCTR_0_SRC_0_3 + \
> +					 4 * (x))
>  # define V3D_PCTR_S0_MASK		V3D_MASK(6, 0)
>  # define V3D_PCTR_S0_SHIFT		0
>  # define V3D_PCTR_S1_MASK		V3D_MASK(14, 8)
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 8992480c88fa..c9a5c916d6eb 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -63,6 +63,16 @@ v3d_job_free(struct drm_sched_job *sched_job)
>  	v3d_job_put(job);
>  }
>
> +static void
> +v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
> +{
> +	if (job->perfmon != v3d->active_perfmon)
> +		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
> +
> +	if (job->perfmon && v3d->active_perfmon != job->perfmon)
> +		v3d_perfmon_start(v3d, job->perfmon);
> +}
> +
>  /*
>   * Returns the fences that the job depends on, one by one.
>   *
> @@ -120,6 +130,8 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
>  	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
>  			    job->start, job->end);
>
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	/* Set the current and end address of the control list.
>  	 * Writing the end register is what starts the job.
>  	 */
> @@ -169,6 +181,8 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
>  	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
>  			    job->start, job->end);
>
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	/* XXX: Set the QCFG */
>
>  	/* Set the current and end address of the control list.
> @@ -240,6 +254,8 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
>
>  	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
>
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	for (i = 1; i <= 6; i++)
>  		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
>  	/* CFG0 write kicks off the job. */
> diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
> index 1ce746e228d9..4104f22fb3d3 100644
> --- a/include/uapi/drm/v3d_drm.h
> +++ b/include/uapi/drm/v3d_drm.h
> @@ -38,6 +38,9 @@ extern "C" {
>  #define DRM_V3D_GET_BO_OFFSET                     0x05
>  #define DRM_V3D_SUBMIT_TFU                        0x06
>  #define DRM_V3D_SUBMIT_CSD                        0x07
> +#define DRM_V3D_PERFMON_CREATE                    0x08
> +#define DRM_V3D_PERFMON_DESTROY                   0x09
> +#define DRM_V3D_PERFMON_GET_VALUES                0x0a
>
>  #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
>  #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
> @@ -47,6 +50,12 @@ extern "C" {
>  #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
>  #define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
>  #define DRM_IOCTL_V3D_SUBMIT_CSD          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
> +#define DRM_IOCTL_V3D_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_CREATE, \
> +						   struct drm_v3d_perfmon_create)
> +#define DRM_IOCTL_V3D_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_DESTROY, \
> +						   struct drm_v3d_perfmon_destroy)
> +#define DRM_IOCTL_V3D_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_GET_VALUES, \
> +						   struct drm_v3d_perfmon_get_values)
>
>  #define DRM_V3D_SUBMIT_CL_FLUSH_CACHE             0x01
>
> @@ -127,6 +136,11 @@ struct drm_v3d_submit_cl {
>  	__u32 bo_handle_count;
>
>  	__u32 flags;
> +
> +	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
> +	__u32 perfmon_id;
> +
> +	__u32 pad;
>  };
>
>  /**
> @@ -195,6 +209,7 @@ enum drm_v3d_param {
>  	DRM_V3D_PARAM_SUPPORTS_TFU,
>  	DRM_V3D_PARAM_SUPPORTS_CSD,
>  	DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH,
> +	DRM_V3D_PARAM_SUPPORTS_PERFMON,
>  };
>
>  struct drm_v3d_get_param {
> @@ -258,6 +273,127 @@ struct drm_v3d_submit_csd {
>  	__u32 in_sync;
>  	/* Sync object to signal when the CSD job is done. */
>  	__u32 out_sync;
> +
> +	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
> +	__u32 perfmon_id;
> +};
> +
> +enum {
> +	V3D_PERFCNT_FEP_VALID_PRIMTS_NO_PIXELS,
> +	V3D_PERFCNT_FEP_VALID_PRIMS,
> +	V3D_PERFCNT_FEP_EZ_NFCLIP_QUADS,
> +	V3D_PERFCNT_FEP_VALID_QUADS,
> +	V3D_PERFCNT_TLB_QUADS_STENCIL_FAIL,
> +	V3D_PERFCNT_TLB_QUADS_STENCILZ_FAIL,
> +	V3D_PERFCNT_TLB_QUADS_STENCILZ_PASS,
> +	V3D_PERFCNT_TLB_QUADS_ZERO_COV,
> +	V3D_PERFCNT_TLB_QUADS_NONZERO_COV,
> +	V3D_PERFCNT_TLB_QUADS_WRITTEN,
> +	V3D_PERFCNT_PTB_PRIM_VIEWPOINT_DISCARD,
> +	V3D_PERFCNT_PTB_PRIM_CLIP,
> +	V3D_PERFCNT_PTB_PRIM_REV,
> +	V3D_PERFCNT_QPU_IDLE_CYCLES,
> +	V3D_PERFCNT_QPU_ACTIVE_CYCLES_VERTEX_COORD_USER,
> +	V3D_PERFCNT_QPU_ACTIVE_CYCLES_FRAG,
> +	V3D_PERFCNT_QPU_CYCLES_VALID_INSTR,
> +	V3D_PERFCNT_QPU_CYCLES_TMU_STALL,
> +	V3D_PERFCNT_QPU_CYCLES_SCOREBOARD_STALL,
> +	V3D_PERFCNT_QPU_CYCLES_VARYINGS_STALL,
> +	V3D_PERFCNT_QPU_IC_HIT,
> +	V3D_PERFCNT_QPU_IC_MISS,
> +	V3D_PERFCNT_QPU_UC_HIT,
> +	V3D_PERFCNT_QPU_UC_MISS,
> +	V3D_PERFCNT_TMU_TCACHE_ACCESS,
> +	V3D_PERFCNT_TMU_TCACHE_MISS,
> +	V3D_PERFCNT_VPM_VDW_STALL,
> +	V3D_PERFCNT_VPM_VCD_STALL,
> +	V3D_PERFCNT_BIN_ACTIVE,
> +	V3D_PERFCNT_RDR_ACTIVE,
> +	V3D_PERFCNT_L2T_HITS,
> +	V3D_PERFCNT_L2T_MISSES,
> +	V3D_PERFCNT_CYCLE_COUNT,
> +	V3D_PERFCNT_QPU_CYCLES_STALLED_VERTEX_COORD_USER,
> +	V3D_PERFCNT_QPU_CYCLES_STALLED_FRAGMENT,
> +	V3D_PERFCNT_PTB_PRIMS_BINNED,
> +	V3D_PERFCNT_AXI_WRITES_WATCH_0,
> +	V3D_PERFCNT_AXI_READS_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_0,
> +	V3D_PERFCNT_AXI_READ_STALLS_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_0,
> +	V3D_PERFCNT_AXI_READ_BYTES_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITES_WATCH_1,
> +	V3D_PERFCNT_AXI_READS_WATCH_1,
> +	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_1,
> +	V3D_PERFCNT_AXI_READ_STALLS_WATCH_1,
> +	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_1,
> +	V3D_PERFCNT_AXI_READ_BYTES_WATCH_1,
> +	V3D_PERFCNT_TLB_PARTIAL_QUADS,
> +	V3D_PERFCNT_TMU_CONFIG_ACCESSES,
> +	V3D_PERFCNT_L2T_NO_ID_STALL,
> +	V3D_PERFCNT_L2T_COM_QUE_STALL,
> +	V3D_PERFCNT_L2T_TMU_WRITES,
> +	V3D_PERFCNT_TMU_ACTIVE_CYCLES,
> +	V3D_PERFCNT_TMU_STALLED_CYCLES,
> +	V3D_PERFCNT_CLE_ACTIVE,
> +	V3D_PERFCNT_L2T_TMU_READS,
> +	V3D_PERFCNT_L2T_CLE_READS,
> +	V3D_PERFCNT_L2T_VCD_READS,
> +	V3D_PERFCNT_L2T_TMUCFG_READS,
> +	V3D_PERFCNT_L2T_SLC0_READS,
> +	V3D_PERFCNT_L2T_SLC1_READS,
> +	V3D_PERFCNT_L2T_SLC2_READS,
> +	V3D_PERFCNT_L2T_TMU_W_MISSES,
> +	V3D_PERFCNT_L2T_TMU_R_MISSES,
> +	V3D_PERFCNT_L2T_CLE_MISSES,
> +	V3D_PERFCNT_L2T_VCD_MISSES,
> +	V3D_PERFCNT_L2T_TMUCFG_MISSES,
> +	V3D_PERFCNT_L2T_SLC0_MISSES,
> +	V3D_PERFCNT_L2T_SLC1_MISSES,
> +	V3D_PERFCNT_L2T_SLC2_MISSES,
> +	V3D_PERFCNT_CORE_MEM_WRITES,
> +	V3D_PERFCNT_L2T_MEM_WRITES,
> +	V3D_PERFCNT_PTB_MEM_WRITES,
> +	V3D_PERFCNT_TLB_MEM_WRITES,
> +	V3D_PERFCNT_CORE_MEM_READS,
> +	V3D_PERFCNT_L2T_MEM_READS,
> +	V3D_PERFCNT_PTB_MEM_READS,
> +	V3D_PERFCNT_PSE_MEM_READS,
> +	V3D_PERFCNT_TLB_MEM_READS,
> +	V3D_PERFCNT_GMP_MEM_READS,
> +	V3D_PERFCNT_PTB_W_MEM_WORDS,
> +	V3D_PERFCNT_TLB_W_MEM_WORDS,
> +	V3D_PERFCNT_PSE_R_MEM_WORDS,
> +	V3D_PERFCNT_TLB_R_MEM_WORDS,
> +	V3D_PERFCNT_TMU_MRU_HITS,
> +	V3D_PERFCNT_COMPUTE_ACTIVE,
> +	V3D_PERFCNT_NUM,
> +};
> +
> +#define DRM_V3D_MAX_PERF_COUNTERS 32
> +
> +struct drm_v3d_perfmon_create {
> +	__u32 id;
> +	__u32 ncounters;
> +	__u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
> +};
> +
> +struct drm_v3d_perfmon_destroy {
> +	__u32 id;
> +};
> +
> +/*
> + * Returns the values of the performance counters tracked by this
> + * perfmon (as an array of ncounters u64 values).
> + *
> + * No implicit synchronization is performed, so the user has to
> + * guarantee that any jobs using this perfmon have already been
> + * completed (probably by blocking on the seqno returned by the
> + * last exec that used the perfmon).
> + */
> +struct drm_v3d_perfmon_get_values {
> +	__u32 id;
> +	__u32 pad;
> +	__u64 values_ptr;
> +};
>
>  #if defined(__cplusplus)
> --
> 2.25.1
>
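
For anyone wanting to try the new uAPI from userspace, a minimal sketch of the
intended flow (create a perfmon, attach its id to a submission, read the values
back, destroy it) could look like the code below. The include path, the render
node path and the simplified error handling are illustrative assumptions; only
the structs, ioctl numbers and counter names come from the patch above.

/* Minimal sketch of the v3d perfmon uAPI flow; include path and render-node
 * path are assumptions, everything else is taken from the uapi header above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "v3d_drm.h"	/* include/uapi/drm/v3d_drm.h from this series */

int main(void)
{
	int fd = open("/dev/dri/renderD128", O_RDWR); /* v3d render node */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Create a perfmon tracking two of the HW events listed above. */
	struct drm_v3d_perfmon_create create = {
		.ncounters = 2,
		.counters = { V3D_PERFCNT_CYCLE_COUNT,
			      V3D_PERFCNT_QPU_CYCLES_VALID_INSTR },
	};
	if (ioctl(fd, DRM_IOCTL_V3D_PERFMON_CREATE, &create)) {
		perror("DRM_IOCTL_V3D_PERFMON_CREATE");
		return 1;
	}

	/* Attach it to jobs by setting drm_v3d_submit_cl/csd.perfmon_id to
	 * create.id before the submit ioctl, and wait for those jobs to
	 * finish; the driver does not do that synchronization for you.
	 */

	/* Read back the accumulated counter values. */
	__u64 values[DRM_V3D_MAX_PERF_COUNTERS] = { 0 };
	struct drm_v3d_perfmon_get_values get = {
		.id = create.id,
		.values_ptr = (uintptr_t)values,
	};
	if (ioctl(fd, DRM_IOCTL_V3D_PERFMON_GET_VALUES, &get))
		perror("DRM_IOCTL_V3D_PERFMON_GET_VALUES");
	else
		printf("cycles=%llu valid-instr=%llu\n",
		       (unsigned long long)values[0],
		       (unsigned long long)values[1]);

	/* Destroying the perfmon is also the only way to "reset" it. */
	struct drm_v3d_perfmon_destroy destroy = { .id = create.id };
	ioctl(fd, DRM_IOCTL_V3D_PERFMON_DESTROY, &destroy);

	close(fd);
	return 0;
}

Support can be probed first with DRM_IOCTL_V3D_GET_PARAM and
DRM_V3D_PARAM_SUPPORTS_PERFMON, which the patch reports only for V3D 4.x and
newer (v3d->ver >= 40).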