On 11/28/2024 8:56 AM, Mario Limonciello wrote: > From: Mario Limonciello <mario.limonciello@xxxxxxx> > > As part of the suspend sequence VRAM needs to be evicted on dGPUs. > In order to make suspend/resume more reliable we moved this into > the pmops prepare() callback so that the suspend sequence would fail > but the system could remain operational under high memory usage suspend. > > Another class of issues exist though where due to memory fragementation > there isn't a large enough contiguous space and swap isn't accessible. > > Add support for a suspend/hibernate notification callback that could > evict VRAM before tasks are frozen. This should allow paging out to swap > if necessary. > > Link: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174 > Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3476 > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362 > Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3781 > Signed-off-by: Mario Limonciello <mario.limonciello@xxxxxxx> Series is - Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> Thanks, Lijo > --- > v5: > * Keep s4 set all the way through prepare. > v4: > * Make non fatal, drop patch 3 > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 - > 3 files changed, 46 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index d8bc6da500161..79ec4ab8ecfc5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -879,6 +879,7 @@ struct amdgpu_device { > bool need_swiotlb; > bool accel_working; > struct notifier_block acpi_nb; > + struct notifier_block pm_nb; > struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS]; > struct debugfs_blob_wrapper debugfs_vbios_blob; > struct debugfs_blob_wrapper debugfs_discovery_blob; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 996e9c78384dd..dc6abeba61948 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -191,6 +191,8 @@ void amdgpu_set_init_level(struct amdgpu_device *adev, > } > > static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); > +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, > + void *data); > > /** > * DOC: pcie_replay_count > @@ -4553,6 +4555,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, > > amdgpu_device_check_iommu_direct_map(adev); > > + adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; > + r = register_pm_notifier(&adev->pm_nb); > + if (r) > + goto failed; > + > return 0; > > release_ras_con: > @@ -4617,6 +4624,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) > drain_workqueue(adev->mman.bdev.wq); > adev->shutdown = true; > > + unregister_pm_notifier(&adev->pm_nb); > + > /* make sure IB test finished before entering exclusive mode > * to avoid preemption on IB test > */ > @@ -4748,6 +4757,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) > /* > * Suspend & resume. > */ > +/** > + * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events > + * @nb: notifier block > + * @mode: suspend mode > + * @data: data > + * > + * This function is called when the system is about to suspend or hibernate. > + * It is used to evict resources from the device before the system goes to > + * sleep while there is still access to swap. > + */ > +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, > + void *data) > +{ > + struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); > + int r; > + > + switch (mode) { > + case PM_HIBERNATION_PREPARE: > + adev->in_s4 = true; > + fallthrough; > + case PM_SUSPEND_PREPARE: > + r = amdgpu_device_evict_resources(adev); > + /* > + * This is considered non-fatal at this time because > + * amdgpu_device_prepare() will also fatally evict resources. > + * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 > + */ > + if (r) > + drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); > + break; > + } > + > + return NOTIFY_DONE; > +} > + > /** > * amdgpu_device_prepare - prepare for device suspend > * > @@ -4787,7 +4831,7 @@ int amdgpu_device_prepare(struct drm_device *dev) > return 0; > > unprepare: > - adev->in_s0ix = adev->in_s3 = false; > + adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; > > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 38686203bea63..5e5ab0b2a7196 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -2552,7 +2552,6 @@ static int amdgpu_pmops_freeze(struct device *dev) > struct amdgpu_device *adev = drm_to_adev(drm_dev); > int r; > > - adev->in_s4 = true; > r = amdgpu_device_suspend(drm_dev, true); > adev->in_s4 = false; > if (r)