Add a work function to send a GPU reset uevent and scheduled it during a GPU reset. Co-developed-by: Shashank Sharma <shashank.sharma@xxxxxxx> Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx> Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx> --- V3: - Merge two last commits V2: Addressed review comments from Christian - Changed the name of the work to gpu_reset_event_work - Added a structure to accommodate some additional information (like a PID and some flags) - Do not add new structure in amdgpu.h --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 30 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 6b74df446694..88cb5b739c5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -60,6 +60,8 @@ #include <drm/amdgpu_drm.h> #include <drm/drm_gem.h> #include <drm/drm_ioctl.h> +#include <drm/gpu_scheduler.h> +#include <drm/drm_sysfs.h> #include <kgd_kfd_interface.h> #include "dm_pp_interface.h" @@ -1003,6 +1005,7 @@ struct amdgpu_device { int asic_reset_res; struct work_struct xgmi_reset_work; + struct work_struct gpu_reset_event_work; struct list_head reset_list; long gfx_timeout; @@ -1036,6 +1039,7 @@ struct amdgpu_device { pci_channel_state_t pci_channel_state; struct amdgpu_reset_control *reset_cntl; + struct drm_reset_event_info reset_event_info; uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; bool ram_is_direct_mapped; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b2b1c66bfe39..d04541fdb606 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -77,6 +77,7 @@ #include <linux/pm_runtime.h> #include <drm/drm_drv.h> +#include <drm/drm_sysfs.h> MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -3365,6 +3366,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) return amdgpu_device_asic_has_dc_support(adev->asic_type); } +static void amdgpu_device_reset_event_func(struct work_struct *__work) +{ + struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, + gpu_reset_event_work); + /* + * A GPU reset has happened, inform the userspace and pass the reset + * related information + */ + drm_sysfs_reset_event(&adev->ddev, &adev->reset_event_info); + + put_pid(adev->reset_event_info.pid); +} + static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = @@ -3616,6 +3630,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_device_delay_enable_gfx_off); INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func); adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_residency = 0; @@ -4920,6 +4935,21 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, goto out; vram_lost = amdgpu_device_check_vram_lost(tmp_adev); + + if (reset_context->job && reset_context->job->vm) { + tmp_adev->reset_event_info.pid = + find_get_pid(reset_context->job->vm->task_info.pid); + } else { + tmp_adev->reset_event_info.pid = NULL; + } + + if (vram_lost) + tmp_adev->reset_event_info.flags |= + DRM_RESET_EVENT_VRAM_LOST; + + /* Send GPU reset event */ + schedule_work(&tmp_adev->gpu_reset_event_work); + #ifdef CONFIG_DEV_COREDUMP tmp_adev->reset_vram_lost = vram_lost; memset(&tmp_adev->reset_task_info, 0, -- 2.38.1