Re: [PATCH 2/2] drm/xe: improve hibernation on igpu

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Nov 01, 2024 at 03:47:26PM +0000, Matthew Auld wrote:
> The GGTT looks to be stored inside stolen memory on igpu which is not
> treated as normal RAM.  The core kernel skips this memory range when
> creating the hibernation image, therefore when coming back from
> hibernation the GGTT programming is lost. This seems to cause issues
> with broken resume where GuC FW fails to load:
> 
> [drm] *ERROR* GT0: load failed: status = 0x400000A0, time = 10ms, freq = 1250MHz (req 1300MHz), done = -1
> [drm] *ERROR* GT0: load failed: status: Reset = 0, BootROM = 0x50, UKernel = 0x00, MIA = 0x00, Auth = 0x01
> [drm] *ERROR* GT0: firmware signature verification failed
> [drm] *ERROR* CRITICAL: Xe has declared device 0000:00:02.0 as wedged.
> 
> Current GGTT users are kernel internal and tracked as pinned, so it
> should be possible to hook into the existing save/restore logic that we
> use for dgpu, where the actual evict is skipped but on restore we
> importantly restore the GGTT programming.  This has been confirmed to
> fix hibernation on at least ADL and MTL, though likely all igpu
> platforms are affected.
> 
> This also means we have a hole in our testing, where the existing s4
> tests only really test the driver hooks, and don't go as far as actually
> rebooting and restoring from the hibernation image and in turn powering
> down RAM (and therefore losing the contents of stolen).
> 
> Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
> Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3275
> Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx>
> Cc: Matthew Brost <matthew.brost@xxxxxxxxx>
> Cc: <stable@xxxxxxxxxxxxxxx> # v6.8+
> ---
>  drivers/gpu/drm/xe/xe_bo.c       | 36 ++++++++++++++------------------
>  drivers/gpu/drm/xe/xe_bo_evict.c |  6 ------
>  2 files changed, 16 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index d79d8ef5c7d5..0ae5c8f7bab8 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -950,7 +950,10 @@ int xe_bo_restore_pinned(struct xe_bo *bo)
>  	if (WARN_ON(!xe_bo_is_pinned(bo)))
>  		return -EINVAL;
>  
> -	if (WARN_ON(xe_bo_is_vram(bo) || !bo->ttm.ttm))
> +	if (WARN_ON(xe_bo_is_vram(bo)))
> +		return -EINVAL;
> +
> +	if (WARN_ON(!bo->ttm.ttm && !xe_bo_is_stolen(bo)))
>  		return -EINVAL;
>  
>  	if (!mem_type_is_vram(place->mem_type))
> @@ -1770,6 +1773,7 @@ int xe_bo_pin_external(struct xe_bo *bo)
>  
>  int xe_bo_pin(struct xe_bo *bo)
>  {
> +	struct ttm_place *place = &(bo->placements[0]);

Checkpatch will complain about the unnecessary parentheses here:
&(bo->placements[0]) can simply be &bo->placements[0].

>  	struct xe_device *xe = xe_bo_device(bo);
>  	int err;
>  
> @@ -1800,7 +1804,6 @@ int xe_bo_pin(struct xe_bo *bo)
>  	 */
>  	if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
>  	    bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
> -		struct ttm_place *place = &(bo->placements[0]);
>  

I think you have an extra blank line here: with the declaration removed,
the if block now starts with an empty line.

>  		if (mem_type_is_vram(place->mem_type)) {
>  			xe_assert(xe, place->flags & TTM_PL_FLAG_CONTIGUOUS);
> @@ -1809,13 +1812,12 @@ int xe_bo_pin(struct xe_bo *bo)
>  				       vram_region_gpu_offset(bo->ttm.resource)) >> PAGE_SHIFT;
>  			place->lpfn = place->fpfn + (bo->size >> PAGE_SHIFT);
>  		}
> +	}
>  
> -		if (mem_type_is_vram(place->mem_type) ||
> -		    bo->flags & XE_BO_FLAG_GGTT) {
> -			spin_lock(&xe->pinned.lock);
> -			list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
> -			spin_unlock(&xe->pinned.lock);
> -		}
> +	if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
> +		spin_lock(&xe->pinned.lock);
> +		list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
> +		spin_unlock(&xe->pinned.lock);
>  	}
>  
>  	ttm_bo_pin(&bo->ttm);
> @@ -1863,24 +1865,18 @@ void xe_bo_unpin_external(struct xe_bo *bo)
>  
>  void xe_bo_unpin(struct xe_bo *bo)
>  {
> +	struct ttm_place *place = &(bo->placements[0]);

Same here: checkpatch will flag the unnecessary parentheses around
bo->placements[0].

>  	struct xe_device *xe = xe_bo_device(bo);
>  
>  	xe_assert(xe, !bo->ttm.base.import_attach);
>  	xe_assert(xe, xe_bo_is_pinned(bo));
>  
> -	if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
> -	    bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
> -		struct ttm_place *place = &(bo->placements[0]);
> -
> -		if (mem_type_is_vram(place->mem_type) ||
> -		    bo->flags & XE_BO_FLAG_GGTT) {
> -			spin_lock(&xe->pinned.lock);
> -			xe_assert(xe, !list_empty(&bo->pinned_link));
> -			list_del_init(&bo->pinned_link);
> -			spin_unlock(&xe->pinned.lock);
> -		}
> +	if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
> +		spin_lock(&xe->pinned.lock);
> +		xe_assert(xe, !list_empty(&bo->pinned_link));
> +		list_del_init(&bo->pinned_link);
> +		spin_unlock(&xe->pinned.lock);
>  	}
> -
>  	ttm_bo_unpin(&bo->ttm);
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
> index 32043e1e5a86..b01bc20eb90b 100644
> --- a/drivers/gpu/drm/xe/xe_bo_evict.c
> +++ b/drivers/gpu/drm/xe/xe_bo_evict.c
> @@ -34,9 +34,6 @@ int xe_bo_evict_all(struct xe_device *xe)
>  	u8 id;
>  	int ret;
>  
> -	if (!IS_DGFX(xe))
> -		return 0;
> -
>  	/* User memory */
>  	for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
>  		struct ttm_resource_manager *man =
> @@ -125,9 +122,6 @@ int xe_bo_restore_kernel(struct xe_device *xe)
>  	struct xe_bo *bo;
>  	int ret;
>  
> -	if (!IS_DGFX(xe))
> -		return 0;
> -

Aside from the NITs, LGTM.

With that:
Reviewed-by: Matthew Brost <matthew.brost@xxxxxxxxx>

>  	spin_lock(&xe->pinned.lock);
>  	for (;;) {
>  		bo = list_first_entry_or_null(&xe->pinned.evicted,
> -- 
> 2.47.0
> 




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux