Boris Brezillon <boris.brezillon@xxxxxxxxxxxxxxxxxx> writes: > The V3D engine has various hardware counters which might be interesting > to userspace performance analysis tools. > > Expose new ioctls to create/destroy a performance monitor object and > query the counter values of this performance monitor. > > Note that a performance monitor is given an ID that is only valid on the > file descriptor it has been allocated from. A performance monitor can be > attached to a CL submission and the driver will enable HW counters for > this request and update the performance monitor values at the end of the > job. > > Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxxxxxxxxx> > --- > Changes in v2: > - Get rid of the CL extension stuff > - Fix isolation of jobs when perfmon attached to them are different > - Add more comments in the code > - Use an SPDX header for vc4_perfmon.c > - Consider 0 as an invalid perfmonid to be backward compatible with mesa > versions that lack perfmon support > --- > drivers/gpu/drm/vc4/Makefile | 1 + > drivers/gpu/drm/vc4/vc4_drv.c | 26 ++++++ > drivers/gpu/drm/vc4/vc4_drv.h | 68 ++++++++++++++ > drivers/gpu/drm/vc4/vc4_gem.c | 48 +++++++++- > drivers/gpu/drm/vc4/vc4_irq.c | 40 +++++++- > drivers/gpu/drm/vc4/vc4_perfmon.c | 188 ++++++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/vc4/vc4_regs.h | 35 +------ > drivers/gpu/drm/vc4/vc4_v3d.c | 64 ++++++------- > include/uapi/drm/vc4_drm.h | 67 ++++++++++++++ > 9 files changed, 465 insertions(+), 72 deletions(-) > create mode 100644 drivers/gpu/drm/vc4/vc4_perfmon.c > > diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile > index f5500df51686..4a3a868235f8 100644 > --- a/drivers/gpu/drm/vc4/Makefile > +++ b/drivers/gpu/drm/vc4/Makefile > @@ -15,6 +15,7 @@ vc4-y := \ > vc4_vec.o \ > vc4_hvs.o \ > vc4_irq.o \ > + vc4_perfmon.o \ > vc4_plane.o \ > vc4_render_cl.o \ > vc4_trace_points.o \ > diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c > index 
ceb385fd69c5..94b99c90425a 100644 > --- a/drivers/gpu/drm/vc4/vc4_drv.c > +++ b/drivers/gpu/drm/vc4/vc4_drv.c > @@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data, > case DRM_VC4_PARAM_SUPPORTS_THREADED_FS: > case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER: > case DRM_VC4_PARAM_SUPPORTS_MADVISE: > + case DRM_VC4_PARAM_SUPPORTS_PERFMON: > args->value = true; > break; > default: > @@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data, > return 0; > } > > +static int vc4_open(struct drm_device *dev, struct drm_file *file) > +{ > + struct vc4_file *vc4file; > + > + vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL); > + if (!vc4file) > + return -ENOMEM; > + > + vc4_perfmon_open_file(vc4file); > + file->driver_priv = vc4file; > + return 0; > +} > + > +static void vc4_close(struct drm_device *dev, struct drm_file *file) > +{ > + struct vc4_file *vc4file = file->driver_priv; > + > + vc4_perfmon_close_file(vc4file); > +} > + > static const struct vm_operations_struct vc4_vm_ops = { > .fault = vc4_fault, > .open = drm_gem_vm_open, > @@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = { > DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW), > }; > > static struct drm_driver vc4_drm_driver = { > @@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = { > DRIVER_RENDER | > DRIVER_PRIME), > .lastclose = drm_fb_helper_lastclose, > + .open = vc4_open, > + .postclose = vc4_close, > .irq_handler = vc4_irq, > .irq_preinstall = vc4_irq_preinstall, > 
.irq_postinstall = vc4_irq_postinstall, > diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h > index 3af22936d9b3..fefa1664a9f5 100644 > --- a/drivers/gpu/drm/vc4/vc4_drv.h > +++ b/drivers/gpu/drm/vc4/vc4_drv.h > @@ -11,6 +11,8 @@ > #include <drm/drm_encoder.h> > #include <drm/drm_gem_cma_helper.h> > > +#include "uapi/drm/vc4_drm.h" > + > /* Don't forget to update vc4_bo.c: bo_type_names[] when adding to > * this. > */ > @@ -29,6 +31,36 @@ enum vc4_kernel_bo_type { > VC4_BO_TYPE_COUNT > }; > > +/* Performance monitor object. The perfmon lifetime is controlled by userspace > + * using perfmon related ioctls. A perfmon can be attached to a submit_cl > + * request, and when this is the case, HW perf counters will be activated just > + * before the submit_cl is submitted to the GPU and disabled when the job is > + * done. This way, only events related to a specific job will be counted. > + */ > +struct vc4_perfmon { > + /* Tracks the number of users of the perfmon, when this counter reaches > + * zero the perfmon is destroyed. > + */ > + refcount_t refcnt; > + > + /* Number of counters activated in this perfmon instance > + * (should not exceed DRM_VC4_MAX_PERF_COUNTERS). > + */ > + u8 ncounters; > + > + /* Events counted by the HW perf counters. */ > + u8 events[DRM_VC4_MAX_PERF_COUNTERS]; > + > + /* Storage for counter values. Counters are incremented by the HW > + * perf counter values every time the perfmon is attached to a GPU job. > + * This way, perfmon users don't have to retrieve the results after > + * each job if they want to track events covering several submissions. > + * Note that counter values can't be reset, but you can fake a reset by > + * destroying the perfmon and creating a new one. 
> + */ > + u64 counters[0]; > +}; > + > struct vc4_dev { > struct drm_device *dev; > > @@ -121,6 +153,11 @@ struct vc4_dev { > wait_queue_head_t job_wait_queue; > struct work_struct job_done_work; > > + /* Used to track the active perfmon if any. Access to this field is > + * protected by job_lock. > + */ > + struct vc4_perfmon *active_perfmon; > + > /* List of struct vc4_seqno_cb for callbacks to be made from a > * workqueue when the given seqno is passed. > */ > @@ -406,6 +443,21 @@ struct vc4_exec_info { > void *uniforms_v; > uint32_t uniforms_p; > uint32_t uniforms_size; > + > + /* Pointer to a performance monitor object if the user requested it, > + * NULL otherwise. > + */ > + struct vc4_perfmon *perfmon; > +}; > + > +/* Per-open file private data. Any driver-specific resource that has to be > + * released when the DRM file is closed should be placed here. > + */ > +struct vc4_file { > + struct { > + struct idr idr; > + struct mutex lock; > + } perfmon; > }; > > static inline struct vc4_exec_info * > @@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec, > /* vc4_validate_shader.c */ > struct vc4_validated_shader_info * > vc4_validate_shader(struct drm_gem_cma_object *shader_obj); > + > +/* vc4_perfmon.c */ > +void vc4_perfmon_get(struct vc4_perfmon *perfmon); > +void vc4_perfmon_put(struct vc4_perfmon *perfmon); > +void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon); > +void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon, > + bool capture); > +struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id); > +void vc4_perfmon_open_file(struct vc4_file *vc4file); > +void vc4_perfmon_close_file(struct vc4_file *vc4file); > +int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data, > + struct drm_file *file_priv); > +int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data, > + struct drm_file *file_priv); > +int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data, > + 
struct drm_file *file_priv); > diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c > index 19ac7fe0e5db..c0589d44e9e1 100644 > --- a/drivers/gpu/drm/vc4/vc4_gem.c > +++ b/drivers/gpu/drm/vc4/vc4_gem.c > @@ -454,14 +454,30 @@ vc4_submit_next_bin_job(struct drm_device *dev) > > vc4_flush_caches(dev); > > + /* Only start the perfmon if it was not already started by a previous > + * job. > + */ > + if (exec->perfmon && vc4->active_perfmon != exec->perfmon) > + vc4_perfmon_start(vc4, exec->perfmon); > + > /* Either put the job in the binner if it uses the binner, or > * immediately move it to the to-be-rendered queue. > */ > if (exec->ct0ca != exec->ct0ea) { > submit_cl(dev, 0, exec->ct0ca, exec->ct0ea); > } else { > + struct vc4_exec_info *next; > + > vc4_move_job_to_render(dev, exec); > - goto again; > + next = vc4_first_bin_job(vc4); > + > + /* We can't start the next bin job if the previous job had a > + * different perfmon instance attached to it. The same goes > + * if one of them had a perfmon attached to it and the other > + * one doesn't. > + */ > + if (next && next->perfmon == exec->perfmon) > + goto again; > } > } > > @@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec, > struct ww_acquire_ctx *acquire_ctx) > { > struct vc4_dev *vc4 = to_vc4_dev(dev); > + struct vc4_exec_info *renderjob; > uint64_t seqno; > unsigned long irqflags; > struct vc4_fence *fence; > @@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec, > > list_add_tail(&exec->head, &vc4->bin_job_list); > > - /* If no job was executing, kick ours off. Otherwise, it'll > - * get started when the previous job's flush done interrupt > - * occurs. > + /* If no bin job was executing and if the render job (if any) has the > + * same perfmon as our job attached to it (or if both jobs don't have > + * perfmon activated), then kick ours off. 
Otherwise, it'll get > + * started when the previous job's flush/render done interrupt occurs. > */ > - if (vc4_first_bin_job(vc4) == exec) { > + renderjob = vc4_first_render_job(vc4); > + if (vc4_first_bin_job(vc4) == exec && > + (!renderjob || renderjob->perfmon == exec->perfmon)) { > vc4_submit_next_bin_job(dev); > vc4_queue_hangcheck(dev); > } > @@ -913,6 +933,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec) > vc4->bin_alloc_used &= ~exec->bin_slots; > spin_unlock_irqrestore(&vc4->job_lock, irqflags); > > + /* Release the reference we had on the perf monitor. */ > + vc4_perfmon_put(exec->perfmon); > + > mutex_lock(&vc4->power_lock); > if (--vc4->power_refcount == 0) { > pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev); > @@ -1065,6 +1088,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data, > struct drm_file *file_priv) > { > struct vc4_dev *vc4 = to_vc4_dev(dev); > + struct vc4_file *vc4file = file_priv->driver_priv; > struct drm_vc4_submit_cl *args = data; > struct vc4_exec_info *exec; > struct ww_acquire_ctx acquire_ctx; > @@ -1078,6 +1102,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data, > return -EINVAL; > } > > + if (args->pad2 != 0) { > + DRM_DEBUG("->pad2 must be set to zero\n"); > + return -EINVAL; > + } > + > exec = kcalloc(1, sizeof(*exec), GFP_KERNEL); > if (!exec) { > DRM_ERROR("malloc failure on exec struct\n"); > @@ -1103,6 +1132,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data, > if (ret) > goto fail; > > + if (args->perfmonid) { > + exec->perfmon = vc4_perfmon_find(vc4file, > + args->perfmonid); > + if (!exec->perfmon) { > + ret = -ENOENT; > + goto fail; > + } > + } > + > if (exec->args->bin_cl_size != 0) { > ret = vc4_get_bcl(dev, exec); > if (ret) > diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c > index 61b2e5377993..0e0b37635646 100644 > --- a/drivers/gpu/drm/vc4/vc4_irq.c > +++ b/drivers/gpu/drm/vc4/vc4_irq.c > @@ -104,13 +104,20 @@ static void > 
vc4_irq_finish_bin_job(struct drm_device *dev) > { > struct vc4_dev *vc4 = to_vc4_dev(dev); > - struct vc4_exec_info *exec = vc4_first_bin_job(vc4); > + struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4); > > if (!exec) > return; > > vc4_move_job_to_render(dev, exec); > - vc4_submit_next_bin_job(dev); > + next = vc4_first_bin_job(vc4); > + > + /* Only submit the next job in the bin list if it matches the perfmon > + * attached to the one that just finished (or if both jobs don't have > + * perfmon attached to them). > + */ > + if (next && next->perfmon == exec->perfmon) > + vc4_submit_next_bin_job(dev); > } > > static void > @@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev) > if (!exec) > return; > > + /* Stop the perfmon so that the next bin job can be started. */ > + if (exec->perfmon) > + vc4_perfmon_stop(vc4, exec->perfmon, false); > + > list_move_tail(&exec->head, &vc4->bin_job_list); > vc4_submit_next_bin_job(dev); > } > @@ -131,17 +142,40 @@ vc4_irq_finish_render_job(struct drm_device *dev) > { > struct vc4_dev *vc4 = to_vc4_dev(dev); > struct vc4_exec_info *exec = vc4_first_render_job(vc4); > + struct vc4_exec_info *nextbin, *nextrender; > > if (!exec) > return; > > vc4->finished_seqno++; > list_move_tail(&exec->head, &vc4->job_done_list); > + > + nextbin = vc4_first_bin_job(vc4); > + nextrender = vc4_first_render_job(vc4); > + > + /* Only stop the perfmon if following jobs in the queue don't expect it > + * to be enabled. > + */ > + if (exec->perfmon && !nextrender && > + (!nextbin || nextbin->perfmon != exec->perfmon)) > + vc4_perfmon_stop(vc4, exec->perfmon, true); > + > + /* If there's a render job waiting, start it. 
If this is not the case > + * we may have to unblock the binner if it's been stalled because of > + * perfmon (this can be checked by comparing the perfmon attached to > + * the finished renderjob to the one attached to the next bin job: if > + * they don't match, this means the binner is stalled and should be > + * restarted). > + */ > + if (nextrender) > + vc4_submit_next_render_job(dev); > + else if (nextbin && nextbin->perfmon != exec->perfmon) > + vc4_submit_next_bin_job(dev); > + > if (exec->fence) { > dma_fence_signal_locked(exec->fence); > exec->fence = NULL; > } > - vc4_submit_next_render_job(dev); > > wake_up_all(&vc4->job_wait_queue); > schedule_work(&vc4->job_done_work); > diff --git a/drivers/gpu/drm/vc4/vc4_perfmon.c b/drivers/gpu/drm/vc4/vc4_perfmon.c > new file mode 100644 > index 000000000000..437e7a27f21d > --- /dev/null > +++ b/drivers/gpu/drm/vc4/vc4_perfmon.c > @@ -0,0 +1,188 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2018 Broadcom > + */ > + > +/** > + * DOC: VC4 V3D performance monitor module > + * > + * The V3D block provides 16 hardware counters which can count various events. 
> + */ > + > +#include "vc4_drv.h" > +#include "vc4_regs.h" > + > +#define VC4_PERFMONID_MIN 1 > +#define VC4_PERFMONID_MAX U32_MAX > + > +void vc4_perfmon_get(struct vc4_perfmon *perfmon) > +{ > + if (perfmon) > + refcount_inc(&perfmon->refcnt); > +} > + > +void vc4_perfmon_put(struct vc4_perfmon *perfmon) > +{ > + if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) > + kfree(perfmon); > +} > + > +void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon) > +{ > + unsigned int i; > + u32 mask; > + > + if (WARN_ON_ONCE(!perfmon || vc4->active_perfmon)) > + return; > + > + for (i = 0; i < perfmon->ncounters; i++) > + V3D_WRITE(V3D_PCTRS(i), perfmon->events[i]); > + > + mask = GENMASK(perfmon->ncounters - 1, 0); > + V3D_WRITE(V3D_PCTRC, mask); > + V3D_WRITE(V3D_PCTRE, V3D_PCTRE_EN | mask); > + vc4->active_perfmon = perfmon; > +} > + > +void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon, > + bool capture) > +{ > + unsigned int i; > + > + if (WARN_ON_ONCE(!vc4->active_perfmon || > + perfmon != vc4->active_perfmon)) > + return; > + > + if (capture) { > + for (i = 0; i < perfmon->ncounters; i++) > + perfmon->counters[i] += V3D_READ(V3D_PCTR(i)); > + } > + > + V3D_WRITE(V3D_PCTRE, 0); > + vc4->active_perfmon = NULL; > +} > + > +struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id) > +{ > + struct vc4_perfmon *perfmon; > + > + mutex_lock(&vc4file->perfmon.lock); > + perfmon = idr_find(&vc4file->perfmon.idr, id); > + vc4_perfmon_get(perfmon); > + mutex_unlock(&vc4file->perfmon.lock); > + > + return perfmon; > +} > + > +void vc4_perfmon_open_file(struct vc4_file *vc4file) > +{ > + mutex_init(&vc4file->perfmon.lock); > + idr_init(&vc4file->perfmon.idr); > +} > + > +static int vc4_perfmon_idr_del(int id, void *elem, void *data) > +{ > + struct vc4_perfmon *perfmon = elem; > + > + vc4_perfmon_put(perfmon); > + > + return 0; > +} > + > +void vc4_perfmon_close_file(struct vc4_file *vc4file) > +{ > + 
mutex_lock(&vc4file->perfmon.lock); > + idr_for_each(&vc4file->perfmon.idr, vc4_perfmon_idr_del, NULL); > + idr_destroy(&vc4file->perfmon.idr); > + mutex_unlock(&vc4file->perfmon.lock); > +} > + > +int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data, > + struct drm_file *file_priv) > +{ > + struct vc4_file *vc4file = file_priv->driver_priv; > + struct drm_vc4_perfmon_create *req = data; > + struct vc4_perfmon *perfmon; > + unsigned int i; > + int ret; > + > + /* Number of monitored counters cannot exceed HW limits. */ > + if (req->ncounters > DRM_VC4_MAX_PERF_COUNTERS || > + !req->ncounters) > + return -EINVAL; > + > + /* Make sure all events are valid. */ > + for (i = 0; i < req->ncounters; i++) { > + if (req->events[i] >= VC4_PERFCNT_NUM_EVENTS) > + return -EINVAL; > + } > + > + perfmon = kzalloc(sizeof(*perfmon) + (req->ncounters * sizeof(u64)), > + GFP_KERNEL); > + if (!perfmon) > + return -ENOMEM; > + > + for (i = 0; i < req->ncounters; i++) > + perfmon->events[i] = req->events[i]; > + > + perfmon->ncounters = req->ncounters; > + > + refcount_set(&perfmon->refcnt, 1); > + > + mutex_lock(&vc4file->perfmon.lock); > + ret = idr_alloc(&vc4file->perfmon.idr, perfmon, VC4_PERFMONID_MIN, > + VC4_PERFMONID_MAX, GFP_KERNEL); > + mutex_unlock(&vc4file->perfmon.lock); > + > + if (ret < 0) { > + kfree(perfmon); > + return ret; > + } > + > + req->id = ret; > + return 0; > +} > + > +int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data, > + struct drm_file *file_priv) > +{ > + struct vc4_file *vc4file = file_priv->driver_priv; > + struct drm_vc4_perfmon_destroy *req = data; > + struct vc4_perfmon *perfmon; > + > + mutex_lock(&vc4file->perfmon.lock); > + perfmon = idr_remove(&vc4file->perfmon.idr, req->id); > + mutex_unlock(&vc4file->perfmon.lock); > + > + if (!perfmon) > + return -EINVAL; > + > + vc4_perfmon_put(perfmon); > + return 0; > +} > + > +int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data, > + struct drm_file *file_priv) > +{ 
> + struct vc4_file *vc4file = file_priv->driver_priv; > + struct drm_vc4_perfmon_get_values *req = data; > + struct vc4_perfmon *perfmon; > + int ret; > + > + mutex_lock(&vc4file->perfmon.lock); > + perfmon = idr_find(&vc4file->perfmon.idr, req->id); > + vc4_perfmon_get(perfmon); > + mutex_unlock(&vc4file->perfmon.lock); > + > + if (!perfmon) > + return -EINVAL; > + > + if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->counters, > + perfmon->ncounters * sizeof(u64))) > + ret = -EFAULT; > + else > + ret = 0; > + > + vc4_perfmon_put(perfmon); > + return ret; > +} > diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h > index 55677bd50f66..b9749cb24063 100644 > --- a/drivers/gpu/drm/vc4/vc4_regs.h > +++ b/drivers/gpu/drm/vc4/vc4_regs.h > @@ -122,38 +122,9 @@ > #define V3D_VPMBASE 0x00504 > #define V3D_PCTRC 0x00670 > #define V3D_PCTRE 0x00674 > -#define V3D_PCTR0 0x00680 > -#define V3D_PCTRS0 0x00684 > -#define V3D_PCTR1 0x00688 > -#define V3D_PCTRS1 0x0068c > -#define V3D_PCTR2 0x00690 > -#define V3D_PCTRS2 0x00694 > -#define V3D_PCTR3 0x00698 > -#define V3D_PCTRS3 0x0069c > -#define V3D_PCTR4 0x006a0 > -#define V3D_PCTRS4 0x006a4 > -#define V3D_PCTR5 0x006a8 > -#define V3D_PCTRS5 0x006ac > -#define V3D_PCTR6 0x006b0 > -#define V3D_PCTRS6 0x006b4 > -#define V3D_PCTR7 0x006b8 > -#define V3D_PCTRS7 0x006bc > -#define V3D_PCTR8 0x006c0 > -#define V3D_PCTRS8 0x006c4 > -#define V3D_PCTR9 0x006c8 > -#define V3D_PCTRS9 0x006cc > -#define V3D_PCTR10 0x006d0 > -#define V3D_PCTRS10 0x006d4 > -#define V3D_PCTR11 0x006d8 > -#define V3D_PCTRS11 0x006dc > -#define V3D_PCTR12 0x006e0 > -#define V3D_PCTRS12 0x006e4 > -#define V3D_PCTR13 0x006e8 > -#define V3D_PCTRS13 0x006ec > -#define V3D_PCTR14 0x006f0 > -#define V3D_PCTRS14 0x006f4 > -#define V3D_PCTR15 0x006f8 > -#define V3D_PCTRS15 0x006fc > +# define V3D_PCTRE_EN BIT(31) > +#define V3D_PCTR(x) (0x00680 + ((x) * 8)) > +#define V3D_PCTRS(x) (0x00684 + ((x) * 8)) > #define V3D_DBGE 0x00f00 > 
#define V3D_FDBGO 0x00f04 > #define V3D_FDBGB 0x00f08 > diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c > index 622cd43840b8..35c00050d18b 100644 > --- a/drivers/gpu/drm/vc4/vc4_v3d.c > +++ b/drivers/gpu/drm/vc4/vc4_v3d.c > @@ -68,38 +68,38 @@ static const struct { > REGDEF(V3D_VPMBASE), > REGDEF(V3D_PCTRC), > REGDEF(V3D_PCTRE), > - REGDEF(V3D_PCTR0), > - REGDEF(V3D_PCTRS0), > - REGDEF(V3D_PCTR1), > - REGDEF(V3D_PCTRS1), > - REGDEF(V3D_PCTR2), > - REGDEF(V3D_PCTRS2), > - REGDEF(V3D_PCTR3), > - REGDEF(V3D_PCTRS3), > - REGDEF(V3D_PCTR4), > - REGDEF(V3D_PCTRS4), > - REGDEF(V3D_PCTR5), > - REGDEF(V3D_PCTRS5), > - REGDEF(V3D_PCTR6), > - REGDEF(V3D_PCTRS6), > - REGDEF(V3D_PCTR7), > - REGDEF(V3D_PCTRS7), > - REGDEF(V3D_PCTR8), > - REGDEF(V3D_PCTRS8), > - REGDEF(V3D_PCTR9), > - REGDEF(V3D_PCTRS9), > - REGDEF(V3D_PCTR10), > - REGDEF(V3D_PCTRS10), > - REGDEF(V3D_PCTR11), > - REGDEF(V3D_PCTRS11), > - REGDEF(V3D_PCTR12), > - REGDEF(V3D_PCTRS12), > - REGDEF(V3D_PCTR13), > - REGDEF(V3D_PCTRS13), > - REGDEF(V3D_PCTR14), > - REGDEF(V3D_PCTRS14), > - REGDEF(V3D_PCTR15), > - REGDEF(V3D_PCTRS15), > + REGDEF(V3D_PCTR(0)), > + REGDEF(V3D_PCTRS(0)), > + REGDEF(V3D_PCTR(1)), > + REGDEF(V3D_PCTRS(1)), > + REGDEF(V3D_PCTR(2)), > + REGDEF(V3D_PCTRS(2)), > + REGDEF(V3D_PCTR(3)), > + REGDEF(V3D_PCTRS(3)), > + REGDEF(V3D_PCTR(4)), > + REGDEF(V3D_PCTRS(4)), > + REGDEF(V3D_PCTR(5)), > + REGDEF(V3D_PCTRS(5)), > + REGDEF(V3D_PCTR(6)), > + REGDEF(V3D_PCTRS(6)), > + REGDEF(V3D_PCTR(7)), > + REGDEF(V3D_PCTRS(7)), > + REGDEF(V3D_PCTR(8)), > + REGDEF(V3D_PCTRS(8)), > + REGDEF(V3D_PCTR(9)), > + REGDEF(V3D_PCTRS(9)), > + REGDEF(V3D_PCTR(10)), > + REGDEF(V3D_PCTRS(10)), > + REGDEF(V3D_PCTR(11)), > + REGDEF(V3D_PCTRS(11)), > + REGDEF(V3D_PCTR(12)), > + REGDEF(V3D_PCTRS(12)), > + REGDEF(V3D_PCTR(13)), > + REGDEF(V3D_PCTRS(13)), > + REGDEF(V3D_PCTR(14)), > + REGDEF(V3D_PCTRS(14)), > + REGDEF(V3D_PCTR(15)), > + REGDEF(V3D_PCTRS(15)), > REGDEF(V3D_DBGE), > REGDEF(V3D_FDBGO), > 
REGDEF(V3D_FDBGB), > diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h > index 52263b575bdc..324776c3bbac 100644 > --- a/include/uapi/drm/vc4_drm.h > +++ b/include/uapi/drm/vc4_drm.h > @@ -42,6 +42,9 @@ extern "C" { > #define DRM_VC4_GET_TILING 0x09 > #define DRM_VC4_LABEL_BO 0x0a > #define DRM_VC4_GEM_MADVISE 0x0b > +#define DRM_VC4_PERFMON_CREATE 0x0c > +#define DRM_VC4_PERFMON_DESTROY 0x0d > +#define DRM_VC4_PERFMON_GET_VALUES 0x0e > > #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) > #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) > @@ -55,6 +58,9 @@ extern "C" { > #define DRM_IOCTL_VC4_GET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling) > #define DRM_IOCTL_VC4_LABEL_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo) > #define DRM_IOCTL_VC4_GEM_MADVISE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise) > +#define DRM_IOCTL_VC4_PERFMON_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_CREATE, struct drm_vc4_perfmon_create) > +#define DRM_IOCTL_VC4_PERFMON_DESTROY DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_DESTROY, struct drm_vc4_perfmon_destroy) > +#define DRM_IOCTL_VC4_PERFMON_GET_VALUES DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_GET_VALUES, struct drm_vc4_perfmon_get_values) > > struct drm_vc4_submit_rcl_surface { > __u32 hindex; /* Handle index, or ~0 if not present. */ > @@ -173,6 +179,15 @@ struct drm_vc4_submit_cl { > * wait ioctl). > */ > __u64 seqno; > + > + /* ID of the perfmon to attach to this job. 0 means no perfmon. */ > + __u32 perfmonid; > + > + /* Unused field to align this struct on 64 bits. Must be set to 0. > + * If one ever needs to add an u32 field to this struct, this field > + * can be used. 
> + */ > + __u32 pad2; > }; > > /** > @@ -308,6 +323,7 @@ struct drm_vc4_get_hang_state { > #define DRM_VC4_PARAM_SUPPORTS_THREADED_FS 5 > #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER 6 > #define DRM_VC4_PARAM_SUPPORTS_MADVISE 7 > +#define DRM_VC4_PARAM_SUPPORTS_PERFMON 8 > > struct drm_vc4_get_param { > __u32 param; > @@ -352,6 +368,57 @@ struct drm_vc4_gem_madvise { > __u32 pad; > }; > > +enum { > + VC4_PERFCNT_FEP_VALID_PRIMS_NO_RENDER, > + VC4_PERFCNT_FEP_VALID_PRIMS_RENDER, > + VC4_PERFCNT_FEP_CLIPPED_QUADS, > + VC4_PERFCNT_FEP_VALID_QUADS, > + VC4_PERFCNT_TLB_QUADS_NOT_PASSING_STENCIL, > + VC4_PERFCNT_TLB_QUADS_NOT_PASSING_Z_AND_STENCIL, > + VC4_PERFCNT_TLB_QUADS_PASSING_Z_AND_STENCIL, > + VC4_PERFCNT_TLB_QUADS_ZERO_COVERAGE, > + VC4_PERFCNT_TLB_QUADS_NON_ZERO_COVERAGE, > + VC4_PERFCNT_TLB_QUADS_WRITTEN_TO_COLOR_BUF, > + VC4_PERFCNT_PLB_PRIMS_OUTSIDE_VIEWPORT, > + VC4_PERFCNT_PLB_PRIMS_NEED_CLIPPING, > + VC4_PERFCNT_PSE_PRIMS_REVERSED, > + VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_EXEC_VALID_INST, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_TMUS, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD, > + VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_VARYINGS, > + VC4_PERFCNT_QPU_TOTAL_INST_CACHE_HIT, > + VC4_PERFCNT_QPU_TOTAL_INST_CACHE_MISS, > + VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_HIT, > + VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_MISS, > + VC4_PERFCNT_TMU_TOTAL_TEXT_QUADS_PROCESSED, > + VC4_PERFCNT_TMU_TOTAL_TEXT_CACHE_MISS, > + VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VDW_STALLED, > + VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VCD_STALLED, > + VC4_PERFCNT_L2C_TOTAL_L2_CACHE_HIT, > + VC4_PERFCNT_L2C_TOTAL_L2_CACHE_MISS, > + VC4_PERFCNT_NUM_EVENTS, > +}; > + > +#define DRM_VC4_MAX_PERF_COUNTERS 16 > + > +struct drm_vc4_perfmon_create { > + __u32 id; > + __u32 ncounters; > + __u8 events[DRM_VC4_MAX_PERF_COUNTERS]; > +}; > + > +struct 
drm_vc4_perfmon_destroy { > + __u32 id; > +}; > + Could we add some docs for get_values? Like: /* * Returns the values of the performance counters tracked by this * perfmon (as an array of ncounters u64 values). * * No implicit synchronization is performed, so the user has to * guarantee that any jobs using this perfmon have already been * completed (probably by blocking on the seqno returned by the * last exec that used the perfmon). */ With that, Reviewed-by: Eric Anholt <eric@xxxxxxxxxx> > +struct drm_vc4_perfmon_get_values { > + __u32 id; > + __u64 values_ptr; > +};
Attachment:
signature.asc
Description: PGP signature
_______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel