Once a user opens an fd for a perf event, if the driver undergoes a function level reset (FLR), the resources are not cleaned up as expected. For this discussion FLR is defined as a PCI unbind followed by a bind. perf_pmu_unregister() would clean up everything, but when the user closes the perf fd, perf_release is executed and we encounter null pointer dereferences and/or list corruption in that path which require a reboot to recover. The only approach that worked to resolve this was to close the file associated with the event such that the relevant cleanup happens w.r.t. the open file. To do so, use the event->owner task and find the file relevant to the event and close it. This relies on the file->private_data matching the event object.

Note:
- Closing the event file is a delayed work that gets queued to system_wq. The close is seen to happen when the kernel returns to user space following the unbind.
- The perf framework will access the pmu object after the last event has been destroyed. The drm device is refcounted in the init and destroy hooks, so this causes a use after free if we are releasing the drm device reference after unbind has been called. To work around this, we take an extra reference in the unbind path and release it using a delayed work in the destroy path. The delayed work is queued to system_wq.

Ref: https://lore.kernel.org/lkml/20240115170120.662220-1-tvrtko.ursulin@xxxxxxxxxxxxxxx/T/#me72abfa2771e6fc94b167ce47efdbf391cc313ab

Opens:
- Synchronization may be needed between i915_pmu_unregister and i915_pmu_event_destroy to avoid any races.
- If unbind and bind happen from the same process, the event fd is closed after bind completes. This means that the cleanup would not happen until bind completes. In this case, i915 loads fine, but pmu registration fails with an error that the sysfs entries are already present. There is no feasible solution here. 
Since this is not a fatal error (reloading i915 works fine) and the usual case is to have bind and unbind in separate processes, there is no intention to solve this. Other solutions/aspects tried: - Call perf_event_disable() followed by perf_event_release_kernel() in the unbind path to clean up the events. This still causes issues when user closes the fd since perf_event_release_kernel() is called again and fails requiring reboot. - Close all event fds in unbind and wait for the close to complete by checking if list is empty. This wait does not work since the files are actually closed when unbind returns to user space. Testing: - New IGT tests have been added for this and are run with KASAN and kmemleak enabled. Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_pmu.c | 96 ++++++++++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_pmu.h | 15 ++++++ 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 4d2a289f848a..2f365c7f5db7 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -4,6 +4,8 @@ * Copyright © 2017-2018 Intel Corporation */ +#include <linux/fdtable.h> +#include <linux/fs.h> #include <linux/pm_runtime.h> #include "gt/intel_engine.h" @@ -573,9 +575,21 @@ static void i915_pmu_event_destroy(struct perf_event *event) { struct i915_pmu *pmu = event_to_pmu(event); struct drm_i915_private *i915 = pmu_to_i915(pmu); + struct i915_event *e = event->pmu_private; drm_WARN_ON(&i915->drm, event->parent); + if (e) { + event->pmu_private = NULL; + list_del(&e->link); + kfree(e); + } + + if (i915->pmu.closed && list_empty(&i915->pmu.initialized_events)) { + pmu_teardown(&i915->pmu); + mod_delayed_work(system_wq, &i915->pmu.work, 50); + } + drm_dev_put(&i915->drm); } @@ -684,6 +698,14 @@ static int i915_pmu_event_init(struct perf_event *event) return ret; if (!event->parent) { + struct i915_event *e = 
kzalloc(sizeof(*e), GFP_KERNEL); + + if (!e) + return -ENOMEM; + + e->event = event; + list_add(&e->link, &pmu->initialized_events); + event->pmu_private = e; drm_dev_get(&i915->drm); event->destroy = i915_pmu_event_destroy; } @@ -1256,6 +1278,14 @@ void i915_pmu_exit(void) cpuhp_remove_multi_state(cpuhp_slot); } +static void i915_pmu_release(struct work_struct *work) +{ + struct i915_pmu *pmu = container_of(work, typeof(*pmu), work.work); + struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu); + + drm_dev_put(&i915->drm); +} + void i915_pmu_register(struct drm_i915_private *i915) { struct i915_pmu *pmu = &i915->pmu; @@ -1313,6 +1343,9 @@ void i915_pmu_register(struct drm_i915_private *i915) pmu->base.read = i915_pmu_event_read; pmu->base.event_idx = i915_pmu_event_event_idx; + INIT_LIST_HEAD(&pmu->initialized_events); + INIT_DELAYED_WORK(&pmu->work, i915_pmu_release); + ret = perf_pmu_register(&pmu->base, pmu->name, -1); if (ret) goto err_groups; @@ -1337,6 +1370,64 @@ void i915_pmu_register(struct drm_i915_private *i915) drm_notice(&i915->drm, "Failed to register PMU!\n"); } +/* Ref: close_fd() */ +static unsigned int __open_files(struct fdtable *fdt) +{ + unsigned int size = fdt->max_fds; + unsigned int i; + + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) + break; + } + return (i + 1) * BITS_PER_LONG; +} + +static void close_event_file(struct perf_event *event) +{ + unsigned int max_open_fds, fd; + struct files_struct *files; + struct task_struct *task; + struct fdtable *fdt; + + task = event->owner; + if (!task) + return; + + files = task->files; + if (!files) + return; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + max_open_fds = __open_files(fdt); + for (fd = 0; fd < max_open_fds; fd++) { + struct file *file = fdt->fd[fd]; + + if (!file || file->private_data != event) + continue; + + rcu_assign_pointer(fdt->fd[fd], NULL); + __clear_bit(fd, fdt->open_fds); + __clear_bit(fd / BITS_PER_LONG, 
fdt->full_fds_bits); + if (fd < files->next_fd) + files->next_fd = fd; + filp_close(file, files); + break; + } + spin_unlock(&files->file_lock); +} + +static void cleanup_events(struct i915_pmu *pmu) +{ + struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu); + struct i915_event *e, *tmp; + + drm_dev_get(&i915->drm); + list_for_each_entry_safe(e, tmp, &pmu->initialized_events, link) + close_event_file(e->event); +} + void i915_pmu_unregister(struct drm_i915_private *i915) { struct i915_pmu *pmu = &i915->pmu; @@ -1354,5 +1445,8 @@ void i915_pmu_unregister(struct drm_i915_private *i915) hrtimer_cancel(&pmu->timer); - pmu_teardown(pmu); + if (list_empty(&pmu->initialized_events)) + pmu_teardown(pmu); + else + cleanup_events(pmu); } diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 41af038c3738..6f62e820f34d 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -55,6 +55,11 @@ struct i915_pmu_sample { u64 cur; }; +struct i915_event { + struct perf_event *event; + struct list_head link; +}; + struct i915_pmu { /** * @cpuhp: Struct used for CPU hotplug handling. @@ -152,6 +157,16 @@ struct i915_pmu { * @pmu_attr: Memory block holding device attributes. */ void *pmu_attr; + + /** + * @initialized_events: List of initialized events + */ + struct list_head initialized_events; + + /** + * @work: worker to delay release of drm device reference + */ + struct delayed_work work; }; #ifdef CONFIG_PERF_EVENTS -- 2.34.1