On Fri, Aug 09, 2024 at 04:31:27PM +0200, Thomas Hellström wrote: > On Thu, 2024-08-08 at 01:37 +0000, Matthew Brost wrote: > > On Wed, Jul 03, 2024 at 05:38:12PM +0200, Thomas Hellström wrote: > > > Rather than relying on the TTM watermark accounting add a shrinker > > > for xe_bos in TT or system memory. > > > > > > Leverage the newly added TTM per-page shrinking and shmem backup > > > support. > > > > > > Although xe doesn't fully support WONTNEED (purgeable) bos yet, > > > introduce and add shrinker support for purgeable ttm_tts. > > > > > > v2: > > > - Cleanups bugfixes and a KUNIT shrinker test. > > > - Add writeback support, and activate if kswapd. > > > v3: > > > - Move the try_shrink() helper to core TTM. > > > - Minor cleanups. > > > v4: > > > - Add runtime pm for the shrinker. Shrinking may require an active > > > device for CCS metadata copying. > > > v5: > > > - Separately purge ghost- and zombie objects in the shrinker. > > > - Fix a format specifier - type inconsistency. (Kernel test robot). 
> > > > > > Cc: Christian König <christian.koenig@xxxxxxx> > > > Cc: Somalapuram Amaranath <Amaranath.Somalapuram@xxxxxxx> > > > Cc: Matthew Brost <matthew.brost@xxxxxxxxx> > > > Cc: <dri-devel@xxxxxxxxxxxxxxxxxxxxx> > > > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> > > > --- > > > drivers/gpu/drm/ttm/ttm_bo_util.c | 67 ++++++ > > > drivers/gpu/drm/xe/Makefile | 1 + > > > drivers/gpu/drm/xe/tests/xe_bo.c | 118 +++++++++++ > > > drivers/gpu/drm/xe/tests/xe_bo_test.c | 1 + > > > drivers/gpu/drm/xe/tests/xe_bo_test.h | 1 + > > > drivers/gpu/drm/xe/xe_bo.c | 155 ++++++++++++-- > > > drivers/gpu/drm/xe/xe_bo.h | 26 +++ > > > drivers/gpu/drm/xe/xe_device.c | 8 + > > > drivers/gpu/drm/xe/xe_device_types.h | 2 + > > > drivers/gpu/drm/xe/xe_shrinker.c | 287 > > > ++++++++++++++++++++++++++ > > > drivers/gpu/drm/xe/xe_shrinker.h | 18 ++ > > > include/drm/ttm/ttm_bo.h | 3 + > > > 12 files changed, 671 insertions(+), 16 deletions(-) > > > create mode 100644 drivers/gpu/drm/xe/xe_shrinker.c > > > create mode 100644 drivers/gpu/drm/xe/xe_shrinker.h > > > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > index c4f678f30fc2..563e96a4cf06 100644 > > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > @@ -924,3 +924,70 @@ long ttm_lru_walk_for_evict(struct > > > ttm_lru_walk *walk, struct ttm_device *bdev, > > > > > > return progress; > > > } > > > +EXPORT_SYMBOL(ttm_lru_walk_for_evict); > > > + > > > +/** > > > + * ttm_bo_try_shrink - LRU walk helper to shrink a ttm buffer > > > object. > > > + * @walk: The struct xe_ttm_lru_walk that describes the walk. > > > + * @bo: The buffer object. > > > + * @purge: Whether to attempt to purge the bo content since it's > > > no > > > + * longer needed. > > > + * @writeback: If !@purge, attempt to write out to persistent > > > storage. 
> > > + * > > > + * The function uses the ttm_tt_back_up functionality to back up > > > or > > > + * purge a struct ttm_tt. If the bo is not in system, it's first > > > + * moved there. > > > + * > > > + * Return: The number of pages shrunken or purged, or > > > + * negative error code on failure. > > > + */ > > > +long ttm_bo_try_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback) > > > +{ > > > + static const struct ttm_place sys_placement_flags = { > > > + .fpfn = 0, > > > + .lpfn = 0, > > > + .mem_type = TTM_PL_SYSTEM, > > > + .flags = 0, > > > + }; > > > + static struct ttm_placement sys_placement = { > > > + .num_placement = 1, > > > + .placement = &sys_placement_flags, > > > + }; > > > + struct ttm_operation_ctx *ctx = walk->ctx; > > > + struct ttm_tt *tt = bo->ttm; > > > + long lret; > > > + > > > + dma_resv_assert_held(bo->base.resv); > > > + > > > + if (!tt || !ttm_tt_is_populated(tt)) > > > + return 0; > > > + > > > + if (bo->resource->mem_type != TTM_PL_SYSTEM) { > > > + int ret = ttm_bo_validate(bo, &sys_placement, > > > ctx); > > > + > > > + if (ret) { > > > + if (ret == -EINTR || ret == -EDEADLK || > > > + ret == -ERESTARTSYS) > > > + return ret; > > > > Can you explain the various error code returns / supression in this > > function? > > Want me to add a comment in the code or inline here? Anyway, the error > codes are codes for which the caller wants to restart. (Signal delivery > or deadlock). For other errors just move on to the next bo on the LRU > list. > Yes, that would help make this clear and help ensure correctness. 
> > > > > + return 0; > > > + } > > > + } > > > + > > > + lret = ttm_bo_wait_ctx(bo, ctx); > > > + if (lret < 0) { > > > + if (lret == -ERESTARTSYS) > > > + return lret; > > > + return 0; > > > + } > > > + > > > + if (bo->deleted) > > > + lret = ttm_tt_backup(bo->bdev, tt, true, > > > writeback); > > > + else > > > + lret = ttm_tt_backup(bo->bdev, tt, purge, > > > writeback); > > > > Hmm, missed this in my previous review. It is frowned upon having > > multiple bools as arguments. Could this be reworked with flags? Same > > goes for all functions in the series with multiple bool arguments. > > I agree. I'll see if I can make this look better. > +1 > > > > > + if (lret < 0 && lret != -EINTR) > > > + return 0; > > > + > > > + return lret; > > > +} > > > +EXPORT_SYMBOL(ttm_bo_try_shrink); > > > diff --git a/drivers/gpu/drm/xe/Makefile > > > b/drivers/gpu/drm/xe/Makefile > > > index b1e03bfe4a68..1eba51bdd172 100644 > > > --- a/drivers/gpu/drm/xe/Makefile > > > +++ b/drivers/gpu/drm/xe/Makefile > > > @@ -112,6 +112,7 @@ xe-y += xe_bb.o \ > > > xe_ring_ops.o \ > > > xe_sa.o \ > > > xe_sched_job.o \ > > > + xe_shrinker.o \ > > > xe_step.o \ > > > xe_sync.o \ > > > xe_tile.o \ > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c > > > b/drivers/gpu/drm/xe/tests/xe_bo.c > > > index 9f3c02826464..49617f16dc76 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo.c > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo.c > > > @@ -6,6 +6,8 @@ > > > #include <kunit/test.h> > > > #include <kunit/visibility.h> > > > > > > +#include <uapi/linux/sysinfo.h> > > > + > > > #include "tests/xe_bo_test.h" > > > #include "tests/xe_pci_test.h" > > > #include "tests/xe_test.h" > > > @@ -350,3 +352,119 @@ void xe_bo_evict_kunit(struct kunit *test) > > > xe_call_for_each_device(evict_test_run_device); > > > } > > > EXPORT_SYMBOL_IF_KUNIT(xe_bo_evict_kunit); > > > + > > > +struct xe_bo_link { > > > + struct list_head link; > > > + struct xe_bo *bo; > > > +}; > > > + > > > +#define XE_BO_SHRINK_SIZE 
((unsigned long)SZ_64M) > > > + > > > +/* > > > + * Try to create system bos corresponding to twice the amount > > > + * of available system memory to test shrinker functionality. > > > + * If no swap space is available to accommodate the > > > + * memory overcommit, mark bos purgeable. > > > + */ > > > +static int shrink_test_run_device(struct xe_device *xe) > > > +{ > > > + struct kunit *test = xe_cur_kunit(); > > > + LIST_HEAD(bos); > > > + struct xe_bo_link *link, *next; > > > + struct sysinfo si; > > > + size_t total, alloced; > > > + unsigned int interrupted = 0, successful = 0; > > > + > > > + si_meminfo(&si); > > > + total = si.freeram * si.mem_unit; > > > + > > > + kunit_info(test, "Free ram is %lu bytes. Will allocate > > > twice of that.\n", > > > + (unsigned long) total); > > > + > > > + total <<= 1; > > > + for (alloced = 0; alloced < total ; alloced += > > > XE_BO_SHRINK_SIZE) { > > > + struct xe_bo *bo; > > > + unsigned int mem_type; > > > + > > > + link = kzalloc(sizeof(*link), GFP_KERNEL); > > > + if (!link) { > > > + KUNIT_FAIL(test, "Unexpeced link > > > allocation failure\n"); > > > + break; > > > + } > > > + > > > + INIT_LIST_HEAD(&link->link); > > > + > > > + /* We can create bos using WC caching here. But it > > > is slower. */ > > > + bo = xe_bo_create_user(xe, NULL, NULL, > > > XE_BO_SHRINK_SIZE, > > > + DRM_XE_GEM_CPU_CACHING_WB, > > > + ttm_bo_type_device, > > > + XE_BO_FLAG_SYSTEM); > > > + if (IS_ERR(bo)) { > > > + if (bo != ERR_PTR(-ENOMEM) && bo != > > > ERR_PTR(-ENOSPC) && > > > + bo != ERR_PTR(-EINTR) && bo != > > > ERR_PTR(-ERESTARTSYS)) > > > + KUNIT_FAIL(test, "Error creating > > > bo: %pe\n", bo); > > > + kfree(link); > > > + break; > > > + } > > > + link->bo = bo; > > > + list_add_tail(&link->link, &bos); > > > + xe_bo_lock(bo, false); > > > + > > > + /* > > > + * If we're low on swap entries, we can't shrink > > > unless the bo > > > + * is marked purgeable. 
> > > + */ > > > + if (get_nr_swap_pages() < (XE_BO_SHRINK_SIZE >> > > > PAGE_SHIFT) * 128) { > > > + struct xe_ttm_tt *xe_tt = > > > + container_of(bo->ttm.ttm, > > > typeof(*xe_tt), ttm); > > > + long num_pages = xe_tt->ttm.num_pages; > > > + > > > + xe_tt->purgeable = true; > > > + xe_shrinker_mod_pages(xe->mem.shrinker, - > > > num_pages, > > > + num_pages); > > > + } > > > + > > > + mem_type = bo->ttm.resource->mem_type; > > > + xe_bo_unlock(bo); > > > + if (mem_type != XE_PL_TT) > > > + KUNIT_FAIL(test, "Bo in incorrect memory > > > type: %u\n", > > > + bo->ttm.resource->mem_type); > > > + cond_resched(); > > > + if (signal_pending(current)) > > > + break; > > > + } > > > + > > > + /* Read back and destroy bos */ > > > + list_for_each_entry_safe_reverse(link, next, &bos, link) { > > > + static struct ttm_operation_ctx ctx = > > > {.interruptible = true}; > > > + struct xe_bo *bo = link->bo; > > > + int ret; > > > + > > > + if (!signal_pending(current)) { > > > + xe_bo_lock(bo, NULL); > > > + ret = ttm_bo_validate(&bo->ttm, > > > &tt_placement, &ctx); > > > + xe_bo_unlock(bo); > > > + if (ret && ret != -EINTR) > > > + KUNIT_FAIL(test, "Validation > > > failed: %pe\n", > > > + ERR_PTR(ret)); > > > + else if (ret) > > > + interrupted++; > > > + else > > > + successful++; > > > + } > > > + xe_bo_put(link->bo); > > > + list_del(&link->link); > > > + kfree(link); > > > + cond_resched(); > > > + } > > > + kunit_info(test, "Readbacks interrupted: %u successful: > > > %u\n", > > > + interrupted, successful); > > > + > > > + return 0; > > > +} > > > + > > > +void xe_bo_shrink_kunit(struct kunit *test) > > > +{ > > > + xe_call_for_each_device(shrink_test_run_device); > > > +} > > > +EXPORT_SYMBOL_IF_KUNIT(xe_bo_shrink_kunit); > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > b/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > index a324cde77db8..317fa923e287 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo_test.c 
> > > @@ -10,6 +10,7 @@ > > > static struct kunit_case xe_bo_tests[] = { > > > KUNIT_CASE(xe_ccs_migrate_kunit), > > > KUNIT_CASE(xe_bo_evict_kunit), > > > + KUNIT_CASE_SLOW(xe_bo_shrink_kunit), > > > {} > > > }; > > > > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > b/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > index 0113ab45066a..7f44d14a45c5 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > @@ -10,5 +10,6 @@ struct kunit; > > > > > > void xe_ccs_migrate_kunit(struct kunit *test); > > > void xe_bo_evict_kunit(struct kunit *test); > > > +void xe_bo_shrink_kunit(struct kunit *test); > > > > > > #endif > > > diff --git a/drivers/gpu/drm/xe/xe_bo.c > > > b/drivers/gpu/drm/xe/xe_bo.c > > > index 65c696966e96..6ab63d1642ae 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo.c > > > +++ b/drivers/gpu/drm/xe/xe_bo.c > > > @@ -10,6 +10,7 @@ > > > #include <drm/drm_drv.h> > > > #include <drm/drm_gem_ttm_helper.h> > > > #include <drm/drm_managed.h> > > > +#include <drm/ttm/ttm_backup.h> > > > #include <drm/ttm/ttm_device.h> > > > #include <drm/ttm/ttm_placement.h> > > > #include <drm/ttm/ttm_tt.h> > > > @@ -25,6 +26,7 @@ > > > #include "xe_pm.h" > > > #include "xe_preempt_fence.h" > > > #include "xe_res_cursor.h" > > > +#include "xe_shrinker.h" > > > #include "xe_trace_bo.h" > > > #include "xe_ttm_stolen_mgr.h" > > > #include "xe_vm.h" > > > @@ -278,11 +280,15 @@ static void xe_evict_flags(struct > > > ttm_buffer_object *tbo, > > > } > > > } > > > > > > +/* struct xe_ttm_tt - Subclassed ttm_tt for xe */ > > > struct xe_ttm_tt { > > > struct ttm_tt ttm; > > > - struct device *dev; > > > + /** @xe - The xe device */ > > > + struct xe_device *xe; > > > struct sg_table sgt; > > > struct sg_table *sg; > > > + /** @purgeable - Whether the bo is purgeable (WONTNEED) */ > > > > So we need to add WONTNEED to our uAPI, right? > > Not strictly, but we have work ongoing to implement this. 
Bos that UMD > pool are typically WONTNEED and unless we mark them as such, shrinking > of these will not work unless we have swap-space available, and then > only with a completely unnecessary copy... > Makes sense. > > > > > + bool purgeable; > > > }; > > > > > > static int xe_tt_map_sg(struct ttm_tt *tt) > > > @@ -291,7 +297,8 @@ static int xe_tt_map_sg(struct ttm_tt *tt) > > > unsigned long num_pages = tt->num_pages; > > > int ret; > > > > > > - XE_WARN_ON(tt->page_flags & TTM_TT_FLAG_EXTERNAL); > > > + XE_WARN_ON((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & > > > TTM_TT_FLAG_EXTERNAL_MAPPABLE)); > > > > > > if (xe_tt->sg) > > > return 0; > > > @@ -299,13 +306,13 @@ static int xe_tt_map_sg(struct ttm_tt *tt) > > > ret = sg_alloc_table_from_pages_segment(&xe_tt->sgt, tt- > > > >pages, > > > num_pages, 0, > > > (u64)num_pages << > > > PAGE_SHIFT, > > > - > > > xe_sg_segment_size(xe_tt->dev), > > > + xe_sg_segment_size > > > (xe_tt->xe->drm.dev), > > > GFP_KERNEL); > > > if (ret) > > > return ret; > > > > > > xe_tt->sg = &xe_tt->sgt; > > > - ret = dma_map_sgtable(xe_tt->dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, > > > + ret = dma_map_sgtable(xe_tt->xe->drm.dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, > > > DMA_ATTR_SKIP_CPU_SYNC); > > > if (ret) { > > > sg_free_table(xe_tt->sg); > > > @@ -321,7 +328,7 @@ static void xe_tt_unmap_sg(struct ttm_tt *tt) > > > struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > > > > if (xe_tt->sg) { > > > - dma_unmap_sgtable(xe_tt->dev, xe_tt->sg, > > > + dma_unmap_sgtable(xe_tt->xe->drm.dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, 0); > > > sg_free_table(xe_tt->sg); > > > xe_tt->sg = NULL; > > > @@ -336,21 +343,41 @@ struct sg_table *xe_bo_sg(struct xe_bo *bo) > > > return xe_tt->sg; > > > } > > > > > > +/* > > > + * Account ttm pages against the device shrinker's shrinkable and > > > + * purgeable counts. 
> > > + */ > > > +static void xe_ttm_tt_account(struct ttm_tt *tt, bool add) > > > +{ > > > > Again I think bools are frowned upon as arguments. Maybe just two > > functions - add / sub? > > OK, > > > > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > + long num_pages = tt->num_pages; > > > + > > > + if (!add) > > > + num_pages = -num_pages; > > > + > > > + if (xe_tt->purgeable) > > > + xe_shrinker_mod_pages(xe_tt->xe->mem.shrinker, 0, > > > num_pages); > > > + else > > > + xe_shrinker_mod_pages(xe_tt->xe->mem.shrinker, > > > num_pages, 0); > > > +} > > > + > > > static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object > > > *ttm_bo, > > > u32 page_flags) > > > { > > > struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); > > > struct xe_device *xe = xe_bo_device(bo); > > > - struct xe_ttm_tt *tt; > > > + struct xe_ttm_tt *xe_tt; > > > + struct ttm_tt *tt; > > > unsigned long extra_pages; > > > enum ttm_caching caching; > > > int err; > > > > > > - tt = kzalloc(sizeof(*tt), GFP_KERNEL); > > > - if (!tt) > > > + xe_tt = kzalloc(sizeof(*xe_tt), GFP_KERNEL); > > > + if (!xe_tt) > > > return NULL; > > > > > > - tt->dev = xe->drm.dev; > > > + tt = &xe_tt->ttm; > > > + xe_tt->xe = xe; > > > > > > extra_pages = 0; > > > if (xe_bo_needs_ccs_pages(bo)) > > > @@ -387,42 +414,128 @@ static struct ttm_tt > > > *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, > > > caching = ttm_uncached; > > > } > > > > > > - err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, > > > extra_pages); > > > + if (ttm_bo->type != ttm_bo_type_sg) > > > + page_flags |= TTM_TT_FLAG_EXTERNAL | > > > TTM_TT_FLAG_EXTERNAL_MAPPABLE; > > > + > > > + err = ttm_tt_init(tt, &bo->ttm, page_flags, caching, > > > extra_pages); > > > if (err) { > > > - kfree(tt); > > > + kfree(xe_tt); > > > return NULL; > > > } > > > > > > - return &tt->ttm; > > > + tt->backup = ttm_backup_shmem_create(tt->num_pages << > > > PAGE_SHIFT); > > > + if (IS_ERR(tt->backup)) { > > > + 
ttm_tt_fini(tt); > > > > Mentioned this in the previous review I think you need to set tt->backup to > > NULL here or update ttm_tt_fini to understand IS_ERR(tt->backup). > > > > Also maybe dumb question but could we just have a global backup for > > all > > BOs? Would that be better than each BO creating its own backup? > > I initially made the code like that, when we had a global backend > directly into the swap cache. But I figure shmem wants one file per bo, > otherwise we'd need one gigantic shmem object... Not sure if that > works... > If one backup per shmem makes sense, then let's do that. I do think we should remove the restriction of only supporting 1 backup per BO so if a driver wants to use a global backup it can. e.g. Don't fini / free the backup in ttm_tt_fini rather have the driver completely own the backup field. Kinda goofy for the driver to setup the field but TTM to fini / free it. > > > > > + kfree(xe_tt); > > > + return NULL; > > > + } > > > + > > > + return tt; > > > } > > > > > > static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct > > > ttm_tt *tt, > > > struct ttm_operation_ctx *ctx) > > > { > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > int err; > > > > > > /* > > > * dma-bufs are not populated with pages, and the dma- > > > * addresses are set up when moved to XE_PL_TT. 
> > > */ > > > - if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) > > > + if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE)) > > > return 0; > > > > > > err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx); > > > if (err) > > > return err; > > > > > > - return err; > > > + xe_tt->purgeable = false; > > > + xe_ttm_tt_account(tt, true); > > > + > > > + return 0; > > > } > > > > > > static void xe_ttm_tt_unpopulate(struct ttm_device *ttm_dev, > > > struct ttm_tt *tt) > > > { > > > - if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) > > > + if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE)) > > > return; > > > > > > xe_tt_unmap_sg(tt); > > > > > > - return ttm_pool_free(&ttm_dev->pool, tt); > > > + ttm_pool_free(&ttm_dev->pool, tt); > > > + xe_ttm_tt_account(tt, false); > > > +} > > > + > > > +/** > > > + * xe_bo_shrink() - Try to shrink an xe bo. > > > + * @walk: - The walk parameters > > > + * @bo: The TTM buffer object > > > + * @purge: Only consider purgeable bos. > > > + * @writeback: Try to write back to persistent storage. > > > + * > > > + * Try to shrink- or purge a bo, and if it succeeds, unmap dma. > > > + * Note that we need to be able to handle also non xe bos > > > + * (ghost bos), but only if the struct ttm_tt is embedded in > > > + * a struct xe_ttm_tt. > > > + * > > > + * Return: The number of pages shrunken or purged, or negative > > > error > > > + * code on failure. 
> > > + */ > > > +long xe_bo_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback) > > > +{ > > > + struct ttm_tt *tt = bo->ttm; > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > + struct ttm_place place = {.mem_type = bo->resource- > > > >mem_type}; > > > + struct xe_bo *xe_bo = ttm_to_xe_bo(bo); > > > + struct xe_device *xe = xe_tt->xe; > > > + bool needs_rpm; > > > + long lret = 0L; > > > + > > > + if (!tt || !ttm_tt_is_populated(tt) || > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE) || > > > + (purge && !xe_tt->purgeable)) > > > + return 0L; > > > + > > > + if (!ttm_bo_eviction_valuable(bo, &place)) > > > + return 0L; > > > + > > > + /* Beware of zombies (GEM object refcount == 0) and > > > ghosts. */ > > > + if (!xe_bo_is_xe_bo(bo) || !xe_bo_get_unless_zero(xe_bo)) > > > { > > > + struct ttm_placement null_placement = { > > > .num_placement = 0 }; > > > + > > > + lret = ttm_bo_wait_ctx(bo, walk->ctx); > > > + if (lret) > > > + return lret; > > > + > > > + /* Purge the bo content! */ > > > + ttm_bo_validate(bo, &null_placement, walk->ctx); > > > + return tt->num_pages; > > > + } > > > + > > > + /* System CCS needs gpu copy when moving PL_TT -> > > > PL_SYSTEM */ > > > + needs_rpm = (!IS_DGFX(xe) && bo->resource->mem_type != > > > XE_PL_SYSTEM && > > > + xe_bo && xe_bo_needs_ccs_pages(xe_bo) && > > > !xe_tt->purgeable); > > > > Is xe_bo check really needed here? > > Yes, I think otherwise xe_bo_needs_ccs_pages will be called with NULL > for ghost objects that aren't xe_bo. > Got it. 
> > > > > + if (needs_rpm && !xe_pm_runtime_get_if_active(xe)) > > > + goto out_unref; > > > + > > > + lret = ttm_bo_try_shrink(walk, bo, xe_tt->purgeable, > > > writeback); > > > + if (needs_rpm) > > > + xe_pm_runtime_put(xe); > > > + > > > + if (lret > 0) { > > > + xe_assert(xe, !ttm_tt_is_populated(tt)); > > > + > > > + xe_ttm_tt_account(tt, false); > > > + } > > > + > > > +out_unref: > > > + xe_bo_put(xe_bo); > > > + > > > + return lret; > > > } > > > > > > static void xe_ttm_tt_destroy(struct ttm_device *ttm_dev, struct > > > ttm_tt *tt) > > > @@ -1238,6 +1351,7 @@ struct xe_bo *___xe_bo_create_locked(struct > > > xe_device *xe, struct xe_bo *bo, > > > struct ttm_operation_ctx ctx = { > > > .interruptible = true, > > > .no_wait_gpu = false, > > > + .gfp_retry_mayfail = true, > > > > Can you explain why you are setting this? > > Hm. We might want this in a separate patch. Without this, bo memory > allocation will typically never fail, but instead start the OOM killer. > I don't think that's the behaviour we want, but yeah deserves its own > patch and review. > Sounds good. 
> > > > > }; > > > struct ttm_placement *placement; > > > uint32_t alignment; > > > @@ -1681,6 +1795,8 @@ int xe_bo_pin_external(struct xe_bo *bo) > > > } > > > > > > ttm_bo_pin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, false); > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1739,6 +1855,8 @@ int xe_bo_pin(struct xe_bo *bo) > > > } > > > > > > ttm_bo_pin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, false); > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1773,6 +1891,9 @@ void xe_bo_unpin_external(struct xe_bo *bo) > > > spin_unlock(&xe->pinned.lock); > > > > > > ttm_bo_unpin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, true); > > > + > > > > Nit: Extra newline. > Will fix. 
> > > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1801,6 +1922,8 @@ void xe_bo_unpin(struct xe_bo *bo) > > > } > > > > > > ttm_bo_unpin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, true); > > > } > > > > > > /** > > > diff --git a/drivers/gpu/drm/xe/xe_bo.h > > > b/drivers/gpu/drm/xe/xe_bo.h > > > index 6de894c728f5..8463e3f3f6f1 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo.h > > > +++ b/drivers/gpu/drm/xe/xe_bo.h > > > @@ -63,6 +63,7 @@ > > > #define XE_BO_PROPS_INVALID (-1) > > > > > > struct sg_table; > > > +struct xe_ttm_lru_walk; > > > > > > struct xe_bo *xe_bo_alloc(void); > > > void xe_bo_free(struct xe_bo *bo); > > > @@ -126,6 +127,28 @@ static inline struct xe_bo *xe_bo_get(struct > > > xe_bo *bo) > > > return bo; > > > } > > > > > > +/* > > > + * xe_bo_get_unless_zero() - Conditionally obtain a GEM object > > > refcount on an > > > + * xe bo > > > + * @bo: The bo for which we want to obtain a refcount. > > > + * > > > + * There is a short window between where the bo's GEM object > > > refcount reaches > > > + * zero and where we put the final ttm_bo reference. Code in the > > > eviction- and > > > + * shrinking path should therefore attempt to grab a gem object > > > reference before > > > + * trying to use members outside of the base class ttm object. > > > This function is > > > + * intended for that purpose. On successful return, this function > > > must be paired > > > + * with an xe_bo_put(). > > > + * > > > + * Return: @bo on success, NULL on failure. 
> > > + */ > > > +static inline __must_check struct xe_bo > > > *xe_bo_get_unless_zero(struct xe_bo *bo) > > > +{ > > > + if (!bo || !kref_get_unless_zero(&bo->ttm.base.refcount)) > > > + return NULL; > > > + > > > + return bo; > > > +} > > > + > > > static inline void xe_bo_put(struct xe_bo *bo) > > > { > > > if (bo) > > > @@ -315,6 +338,9 @@ static inline unsigned int > > > xe_sg_segment_size(struct device *dev) > > > > > > #define > > > i915_gem_object_flush_if_display(obj) ((void)(obj)) > > > > > > +long xe_bo_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback); > > > + > > > #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) > > > /** > > > * xe_bo_is_mem_type - Whether the bo currently resides in the > > > given > > > diff --git a/drivers/gpu/drm/xe/xe_device.c > > > b/drivers/gpu/drm/xe/xe_device.c > > > index cfda7cb5df2c..58fecc4b0a18 100644 > > > --- a/drivers/gpu/drm/xe/xe_device.c > > > +++ b/drivers/gpu/drm/xe/xe_device.c > > > @@ -47,6 +47,7 @@ > > > #include "xe_perf.h" > > > #include "xe_pm.h" > > > #include "xe_query.h" > > > +#include "xe_shrinker.h" > > > #include "xe_sriov.h" > > > #include "xe_tile.h" > > > #include "xe_ttm_stolen_mgr.h" > > > @@ -241,6 +242,9 @@ static void xe_device_destroy(struct drm_device > > > *dev, void *dummy) > > > if (xe->unordered_wq) > > > destroy_workqueue(xe->unordered_wq); > > > > > > + if (!IS_ERR_OR_NULL(xe->mem.shrinker)) > > > + xe_shrinker_destroy(xe->mem.shrinker); > > > + > > > ttm_device_fini(&xe->ttm); > > > } > > > > > > @@ -270,6 +274,10 @@ struct xe_device *xe_device_create(struct > > > pci_dev *pdev, > > > if (err) > > > goto err; > > > > > > + xe->mem.shrinker = xe_shrinker_create(xe); > > > + if (IS_ERR(xe->mem.shrinker)) > > > + return ERR_CAST(xe->mem.shrinker); > > > + > > > xe->info.devid = pdev->device; > > > xe->info.revid = pdev->revision; > > > xe->info.force_execlist = xe_modparam.force_execlist; > > > diff --git 
a/drivers/gpu/drm/xe/xe_device_types.h > > > b/drivers/gpu/drm/xe/xe_device_types.h > > > index c37be471d11c..3d5440aba52e 100644 > > > --- a/drivers/gpu/drm/xe/xe_device_types.h > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h > > > @@ -325,6 +325,8 @@ struct xe_device { > > > struct xe_mem_region vram; > > > /** @mem.sys_mgr: system TTM manager */ > > > struct ttm_resource_manager sys_mgr; > > > + /** @mem.sys_mgr: system memory shrinker. */ > > > + struct xe_shrinker *shrinker; > > > } mem; > > > > > > /** @sriov: device level virtualization data */ > > > diff --git a/drivers/gpu/drm/xe/xe_shrinker.c > > > b/drivers/gpu/drm/xe/xe_shrinker.c > > > new file mode 100644 > > > index 000000000000..3f9554bdc06b > > > --- /dev/null > > > +++ b/drivers/gpu/drm/xe/xe_shrinker.c > > > @@ -0,0 +1,287 @@ > > > +// SPDX-License-Identifier: MIT > > > +/* > > > + * Copyright © 2024 Intel Corporation > > > + */ > > > + > > > +#include <linux/shrinker.h> > > > +#include <linux/swap.h> > > > + > > > +#include <drm/ttm/ttm_bo.h> > > > +#include <drm/ttm/ttm_tt.h> > > > + > > > +#include "xe_bo.h" > > > +#include "xe_pm.h" > > > +#include "xe_shrinker.h" > > > + > > > +/** > > > + * struct xe_shrinker - per-device shrinker > > > + * @xe: Back pointer to the device. > > > + * @lock: Lock protecting accounting. > > > + * @shrinkable_pages: Number of pages that are currently > > > shrinkable. > > > + * @purgeable_pages: Number of pages that are currently purgeable. > > > + * @shrink: Pointer to the mm shrinker. > > > + * @pm_worker: Worker to wake up the device if required. > > > + */ > > > +struct xe_shrinker { > > > + struct xe_device *xe; > > > + rwlock_t lock; > > > + long shrinkable_pages; > > > + long purgeable_pages; > > > + struct shrinker *shrink; > > > + struct work_struct pm_worker; > > > +}; > > > + > > > +/** > > > + * struct xe_shrink_lru_walk - lru_walk subclass for shrinker > > > + * @walk: The embedded base class. > > > + * @xe: Pointer to the xe device. 
> > > + * @purge: Purgeable only request from the srinker. > > > + * @writeback: Try to write back to persistent storage. > > > + */ > > > +struct xe_shrink_lru_walk { > > > + struct ttm_lru_walk walk; > > > + struct xe_device *xe; > > > + bool purge; > > > + bool writeback; > > > +}; > > > + > > > +static struct xe_shrinker *to_xe_shrinker(struct shrinker *shrink) > > > +{ > > > + return shrink->private_data; > > > +} > > > + > > > +static struct xe_shrink_lru_walk * > > > +to_xe_shrink_lru_walk(struct ttm_lru_walk *walk) > > > +{ > > > + return container_of(walk, struct xe_shrink_lru_walk, > > > walk); > > > +} > > > + > > > +/** > > > + * xe_shrinker_mod_pages() - Modify shrinker page accounting > > > + * @shrinker: Pointer to the struct xe_shrinker. > > > + * @shrinkable: Shrinkable pages delta. May be negative. > > > + * @purgeable: Purgeable page delta. May be negative. > > > + * > > > + * Modifies the shrinkable and purgeable pages accounting. > > > + */ > > > +void > > > +xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long > > > shrinkable, long purgeable) > > > +{ > > > + write_lock(&shrinker->lock); > > > + shrinker->shrinkable_pages += shrinkable; > > > + shrinker->purgeable_pages += purgeable; > > > + write_unlock(&shrinker->lock); > > > +} > > > + > > > +static long xe_shrinker_process_bo(struct ttm_lru_walk *walk, > > > struct ttm_buffer_object *bo) > > > +{ > > > + struct xe_shrink_lru_walk *shrink_walk = > > > to_xe_shrink_lru_walk(walk); > > > + > > > + return xe_bo_shrink(walk, bo, shrink_walk->purge, > > > shrink_walk->writeback); > > > +} > > > + > > > +static long xe_shrinker_walk(struct xe_shrink_lru_walk > > > *shrink_walk, long target) > > > +{ > > > + struct xe_device *xe = shrink_walk->xe; > > > + struct ttm_resource_manager *man; > > > + unsigned int mem_type; > > > + long sofar = 0; > > > + long lret; > > > + > > > + for (mem_type = XE_PL_SYSTEM; mem_type <= XE_PL_TT; > > > ++mem_type) { > > > + man = ttm_manager_type(&xe->ttm, 
mem_type); > > > + if (!man || !man->use_tt) > > > + continue; > > > + > > > + lret = ttm_lru_walk_for_evict(&shrink_walk->walk, > > > &xe->ttm, man, target); > > > + if (lret < 0) > > > + return lret; > > > + > > > + sofar += lret; > > > + if (sofar >= target) > > > + break; > > > + } > > > + > > > + return sofar; > > > +} > > > + > > > +static unsigned long > > > +xe_shrinker_count(struct shrinker *shrink, struct shrink_control > > > *sc) > > > +{ > > > + struct xe_shrinker *shrinker = to_xe_shrinker(shrink); > > > + unsigned long num_pages; > > > + > > > + num_pages = get_nr_swap_pages(); > > > + read_lock(&shrinker->lock); > > > + num_pages = min_t(unsigned long, num_pages, shrinker- > > > >shrinkable_pages); > > > + num_pages += shrinker->purgeable_pages; > > > + read_unlock(&shrinker->lock); > > > + > > > + return num_pages ? num_pages : SHRINK_EMPTY; > > > +} > > > + > > > +static const struct ttm_lru_walk_ops xe_shrink_ops = { > > > + .process_bo = xe_shrinker_process_bo, > > > +}; > > > + > > > +/* > > > + * Check if we need runtime pm, and if so try to grab a reference > > > if > > > + * already active. If grabbing a reference fails, queue a worker > > > that > > > + * does it for us outside of reclaim, but don't wait for it to > > > complete. > > > + * If bo shrinking needs an rpm reference and we don't have it > > > (yet), > > > + * that bo will be skipped anyway. 
> > > + */ > > > +static bool xe_shrinker_runtime_pm_get(struct xe_shrinker > > > *shrinker, bool force, > > > + unsigned long nr_to_scan) > > > +{ > > > + struct xe_device *xe = shrinker->xe; > > > + > > > + if (IS_DGFX(xe) || !xe_device_has_flat_ccs(xe) || > > > + !get_nr_swap_pages()) > > > + return false; > > > + > > > + if (!force) { > > > + read_lock(&shrinker->lock); > > > + force = (nr_to_scan > shrinker->purgeable_pages); > > > + read_unlock(&shrinker->lock); > > > + if (!force) > > > + return false; > > > + } > > > + > > > + if (!xe_pm_runtime_get_if_active(xe)) { > > > + queue_work(xe->unordered_wq, &shrinker- > > > >pm_worker); > > > + return false; > > > + } > > > + > > > + return true; > > > +} > > > + > > > +static void xe_shrinker_runtime_pm_put(struct xe_shrinker > > > *shrinker, bool runtime_pm) > > > +{ > > > + if (runtime_pm) > > > + xe_pm_runtime_put(shrinker->xe); > > > +} > > > + > > > +static unsigned long xe_shrinker_scan(struct shrinker *shrink, > > > struct shrink_control *sc) > > > +{ > > > + struct xe_shrinker *shrinker = to_xe_shrinker(shrink); > > > + bool is_kswapd = current_is_kswapd(); > > > + struct ttm_operation_ctx ctx = { > > > + .interruptible = false, > > > + .no_wait_gpu = !is_kswapd, > > > + }; > > > + unsigned long nr_to_scan, freed = 0; > > > + struct xe_shrink_lru_walk shrink_walk = { > > > + .walk = { > > > + .ops = &xe_shrink_ops, > > > + .ctx = &ctx, > > > + .trylock_only = true, > > > + }, > > > + .xe = shrinker->xe, > > > + .purge = true, > > > + .writeback = is_kswapd, > > > + }; > > > + bool runtime_pm; > > > + bool purgeable; > > > + long ret; > > > + > > > + sc->nr_scanned = 0; > > > + nr_to_scan = sc->nr_to_scan; > > > + > > > + read_lock(&shrinker->lock); > > > + purgeable = !!shrinker->purgeable_pages; > > > + read_unlock(&shrinker->lock); > > > + > > > + /* Might need runtime PM. Try to wake early if it looks > > > like it. 
*/ > > > + runtime_pm = xe_shrinker_runtime_pm_get(shrinker, false, > > > nr_to_scan); > > > + > > > + while (purgeable && freed < nr_to_scan) { > > > + ret = xe_shrinker_walk(&shrink_walk, nr_to_scan); > > > + if (ret <= 0) > > > + break; > > > + > > > + freed += ret; > > > + } > > > + > > > + sc->nr_scanned = freed; > > > + if (freed < nr_to_scan) > > > + nr_to_scan -= freed; > > > + else > > > + nr_to_scan = 0; > > > + if (!nr_to_scan) > > > + goto out; > > > + > > > + /* If we didn't wake before, try to do it now if needed. > > > */ > > > + if (!runtime_pm) > > > + runtime_pm = xe_shrinker_runtime_pm_get(shrinker, > > > true, 0); > > > + > > > + shrink_walk.purge = false; > > > + nr_to_scan = sc->nr_to_scan; > > > + while (freed < nr_to_scan) { > > > + ret = xe_shrinker_walk(&shrink_walk, nr_to_scan); > > > + if (ret <= 0) > > > + break; > > > + > > > + freed += ret; > > > + } > > > + > > > + sc->nr_scanned = freed; > > > + > > > +out: > > > + xe_shrinker_runtime_pm_put(shrinker, runtime_pm); > > > + return freed ? freed : SHRINK_STOP; > > > +} > > > + > > > +/* Wake up the device for shrinking. */ > > > +static void xe_shrinker_pm(struct work_struct *work) > > > +{ > > > + struct xe_shrinker *shrinker = > > > + container_of(work, typeof(*shrinker), pm_worker); > > > + > > > + xe_pm_runtime_get(shrinker->xe); > > > + xe_pm_runtime_put(shrinker->xe); > > > > So I don't really get this. How does this help the shrinker get a PM > > ref? The small window between xe_pm_runtime_get / put the shrinker > > grabs > > one via xe_pm_runtime_get_if_active? Seems fragile. > > The pm has a delay after the last put (Typically 1s?), and if the > shrinker fails obtaining a reference it will be retried. Typically it > will then succeed and when it does start a new delay. > Ah, forgot that there is a delay on the final put. This makes sense now. Can't really think of a better solution than this at the moment. Maybe add a comment indicating this? 
Matt > This doesn't strictly guarantee shrinking progress, but there's no way > we can synchronously delay shrinking until we have the pm reference. > > What we could perhaps do is to report back that we actually made > progress but postpone the actual shrinking to the worker with writeback > set to on; hoping that the shrinking core doesn't give up before it > sees the released memory... > > But I'd prefer to leave that out until we see that that's actually a > real problem. > > Best would be if we could have access the LNL CCS memory using the > CPU... > > /Thomas > > > > > > Matt > > > > > +} > > > + > > > +/** > > > + * xe_shrinker_create() - Create an xe per-device shrinker > > > + * @xe: Pointer to the xe device. > > > + * > > > + * Returns: A pointer to the created shrinker on success, > > > + * Negative error code on failure. > > > + */ > > > +struct xe_shrinker *xe_shrinker_create(struct xe_device *xe) > > > +{ > > > + struct xe_shrinker *shrinker = kzalloc(sizeof(*shrinker), > > > GFP_KERNEL); > > > + > > > + if (!shrinker) > > > + return ERR_PTR(-ENOMEM); > > > + > > > + shrinker->shrink = shrinker_alloc(0, "xe system > > > shrinker"); > > > + if (!shrinker->shrink) { > > > + kfree(shrinker); > > > + return ERR_PTR(-ENOMEM); > > > + } > > > + > > > + INIT_WORK(&shrinker->pm_worker, xe_shrinker_pm); > > > + shrinker->xe = xe; > > > + rwlock_init(&shrinker->lock); > > > + shrinker->shrink->count_objects = xe_shrinker_count; > > > + shrinker->shrink->scan_objects = xe_shrinker_scan; > > > + shrinker->shrink->private_data = shrinker; > > > + shrinker_register(shrinker->shrink); > > > + > > > + return shrinker; > > > +} > > > + > > > +/** > > > + * xe_shrinker_destroy() - Destroy an xe per-device shrinker > > > + * @shrinker: Pointer to the shrinker to destroy. 
> > > + */ > > > +void xe_shrinker_destroy(struct xe_shrinker *shrinker) > > > +{ > > > + xe_assert(shrinker->xe, !shrinker->shrinkable_pages); > > > + xe_assert(shrinker->xe, !shrinker->purgeable_pages); > > > + shrinker_free(shrinker->shrink); > > > + flush_work(&shrinker->pm_worker); > > > + kfree(shrinker); > > > +} > > > diff --git a/drivers/gpu/drm/xe/xe_shrinker.h > > > b/drivers/gpu/drm/xe/xe_shrinker.h > > > new file mode 100644 > > > index 000000000000..28a038f4fcbf > > > --- /dev/null > > > +++ b/drivers/gpu/drm/xe/xe_shrinker.h > > > @@ -0,0 +1,18 @@ > > > +/* SPDX-License-Identifier: MIT */ > > > +/* > > > + * Copyright © 2024 Intel Corporation > > > + */ > > > + > > > +#ifndef _XE_SHRINKER_H_ > > > +#define _XE_SHRINKER_H_ > > > + > > > +struct xe_shrinker; > > > +struct xe_device; > > > + > > > +void xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long > > > shrinkable, long purgeable); > > > + > > > +struct xe_shrinker *xe_shrinker_create(struct xe_device *xe); > > > + > > > +void xe_shrinker_destroy(struct xe_shrinker *shrinker); > > > + > > > +#endif > > > diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h > > > index e577528f5dfc..c7e81ae025d9 100644 > > > --- a/include/drm/ttm/ttm_bo.h > > > +++ b/include/drm/ttm/ttm_bo.h > > > @@ -229,6 +229,9 @@ struct ttm_lru_walk { > > > long ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct > > > ttm_device *bdev, > > > struct ttm_resource_manager *man, long > > > target); > > > > > > +long ttm_bo_try_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback); > > > + > > > /** > > > * ttm_bo_get - reference a struct ttm_buffer_object > > > * > > > -- > > > 2.44.0 > > > >