On Fri, Aug 09, 2024 at 04:31:27PM +0200, Thomas Hellström wrote: > On Thu, 2024-08-08 at 01:37 +0000, Matthew Brost wrote: > > On Wed, Jul 03, 2024 at 05:38:12PM +0200, Thomas Hellström wrote: > > > Rather than relying on the TTM watermark accounting add a shrinker > > > for xe_bos in TT or system memory. > > > > > > Leverage the newly added TTM per-page shrinking and shmem backup > > > support. > > > > > > Although xe doesn't fully support WONTNEED (purgeable) bos yet, > > > introduce and add shrinker support for purgeable ttm_tts. > > > > > > v2: > > > - Cleanups bugfixes and a KUNIT shrinker test. > > > - Add writeback support, and activate if kswapd. > > > v3: > > > - Move the try_shrink() helper to core TTM. > > > - Minor cleanups. > > > v4: > > > - Add runtime pm for the shrinker. Shrinking may require an active > > > device for CCS metadata copying. > > > v5: > > > - Separately purge ghost- and zombie objects in the shrinker. > > > - Fix a format specifier - type inconsistency. (Kernel test robot). 
> > > > > > Cc: Christian König <christian.koenig@xxxxxxx> > > > Cc: Somalapuram Amaranath <Amaranath.Somalapuram@xxxxxxx> > > > Cc: Matthew Brost <matthew.brost@xxxxxxxxx> > > > Cc: <dri-devel@xxxxxxxxxxxxxxxxxxxxx> > > > Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> > > > --- > > > drivers/gpu/drm/ttm/ttm_bo_util.c | 67 ++++++ > > > drivers/gpu/drm/xe/Makefile | 1 + > > > drivers/gpu/drm/xe/tests/xe_bo.c | 118 +++++++++++ > > > drivers/gpu/drm/xe/tests/xe_bo_test.c | 1 + > > > drivers/gpu/drm/xe/tests/xe_bo_test.h | 1 + > > > drivers/gpu/drm/xe/xe_bo.c | 155 ++++++++++++-- > > > drivers/gpu/drm/xe/xe_bo.h | 26 +++ > > > drivers/gpu/drm/xe/xe_device.c | 8 + > > > drivers/gpu/drm/xe/xe_device_types.h | 2 + > > > drivers/gpu/drm/xe/xe_shrinker.c | 287 > > > ++++++++++++++++++++++++++ > > > drivers/gpu/drm/xe/xe_shrinker.h | 18 ++ > > > include/drm/ttm/ttm_bo.h | 3 + > > > 12 files changed, 671 insertions(+), 16 deletions(-) > > > create mode 100644 drivers/gpu/drm/xe/xe_shrinker.c > > > create mode 100644 drivers/gpu/drm/xe/xe_shrinker.h > > > > > > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > index c4f678f30fc2..563e96a4cf06 100644 > > > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c > > > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c > > > @@ -924,3 +924,70 @@ long ttm_lru_walk_for_evict(struct > > > ttm_lru_walk *walk, struct ttm_device *bdev, > > > > > > return progress; > > > } > > > +EXPORT_SYMBOL(ttm_lru_walk_for_evict); > > > + > > > +/** > > > + * ttm_bo_try_shrink - LRU walk helper to shrink a ttm buffer > > > object. > > > + * @walk: The struct xe_ttm_lru_walk that describes the walk. > > > + * @bo: The buffer object. > > > + * @purge: Whether to attempt to purge the bo content since it's > > > no > > > + * longer needed. > > > + * @writeback: If !@purge, attempt to write out to persistent > > > storage. 
> > > + * > > > + * The function uses the ttm_tt_back_up functionality to back up > > > or > > > + * purge a struct ttm_tt. If the bo is not in system, it's first > > > + * moved there. > > > + * > > > + * Return: The number of pages shrunken or purged, or > > > + * negative error code on failure. > > > + */ > > > +long ttm_bo_try_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback) > > > +{ > > > + static const struct ttm_place sys_placement_flags = { > > > + .fpfn = 0, > > > + .lpfn = 0, > > > + .mem_type = TTM_PL_SYSTEM, > > > + .flags = 0, > > > + }; > > > + static struct ttm_placement sys_placement = { > > > + .num_placement = 1, > > > + .placement = &sys_placement_flags, > > > + }; > > > + struct ttm_operation_ctx *ctx = walk->ctx; > > > + struct ttm_tt *tt = bo->ttm; > > > + long lret; > > > + > > > + dma_resv_assert_held(bo->base.resv); > > > + > > > + if (!tt || !ttm_tt_is_populated(tt)) > > > + return 0; > > > + > > > + if (bo->resource->mem_type != TTM_PL_SYSTEM) { > > > + int ret = ttm_bo_validate(bo, &sys_placement, > > > ctx); > > > + > > > + if (ret) { > > > + if (ret == -EINTR || ret == -EDEADLK || > > > + ret == -ERESTARTSYS) > > > + return ret; > > > > Can you explain the various error code returns / supression in this > > function? > > Want me to add a comment in the code or inline here? Anyway, the error > codes are codes for which the caller wants to restart. (Signal delivery > or deadlock). For other errors just move on to the next bo on the LRU > list. > Yes, that would help make this clear and help ensure correctness. 
> > > > > + return 0; > > > + } > > > + } > > > + > > > + lret = ttm_bo_wait_ctx(bo, ctx); > > > + if (lret < 0) { > > > + if (lret == -ERESTARTSYS) > > > + return lret; > > > + return 0; > > > + } > > > + > > > + if (bo->deleted) > > > + lret = ttm_tt_backup(bo->bdev, tt, true, > > > writeback); > > > + else > > > + lret = ttm_tt_backup(bo->bdev, tt, purge, > > > writeback); > > > > Hmm, missed this in my previous review. It is frowned upon having > > multiple bools as arguments. Could this be reworked with flags? Same > > goes for all functions in the series with multiple bool arguments. > > I agree. I'll see if I can make this look better. > +1 > > > > > + if (lret < 0 && lret != -EINTR) > > > + return 0; > > > + > > > + return lret; > > > +} > > > +EXPORT_SYMBOL(ttm_bo_try_shrink); > > > diff --git a/drivers/gpu/drm/xe/Makefile > > > b/drivers/gpu/drm/xe/Makefile > > > index b1e03bfe4a68..1eba51bdd172 100644 > > > --- a/drivers/gpu/drm/xe/Makefile > > > +++ b/drivers/gpu/drm/xe/Makefile > > > @@ -112,6 +112,7 @@ xe-y += xe_bb.o \ > > > xe_ring_ops.o \ > > > xe_sa.o \ > > > xe_sched_job.o \ > > > + xe_shrinker.o \ > > > xe_step.o \ > > > xe_sync.o \ > > > xe_tile.o \ > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c > > > b/drivers/gpu/drm/xe/tests/xe_bo.c > > > index 9f3c02826464..49617f16dc76 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo.c > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo.c > > > @@ -6,6 +6,8 @@ > > > #include <kunit/test.h> > > > #include <kunit/visibility.h> > > > > > > +#include <uapi/linux/sysinfo.h> > > > + > > > #include "tests/xe_bo_test.h" > > > #include "tests/xe_pci_test.h" > > > #include "tests/xe_test.h" > > > @@ -350,3 +352,119 @@ void xe_bo_evict_kunit(struct kunit *test) > > > xe_call_for_each_device(evict_test_run_device); > > > } > > > EXPORT_SYMBOL_IF_KUNIT(xe_bo_evict_kunit); > > > + > > > +struct xe_bo_link { > > > + struct list_head link; > > > + struct xe_bo *bo; > > > +}; > > > + > > > +#define XE_BO_SHRINK_SIZE 
((unsigned long)SZ_64M) > > > + > > > +/* > > > + * Try to create system bos corresponding to twice the amount > > > + * of available system memory to test shrinker functionality. > > > + * If no swap space is available to accommodate the > > > + * memory overcommit, mark bos purgeable. > > > + */ > > > +static int shrink_test_run_device(struct xe_device *xe) > > > +{ > > > + struct kunit *test = xe_cur_kunit(); > > > + LIST_HEAD(bos); > > > + struct xe_bo_link *link, *next; > > > + struct sysinfo si; > > > + size_t total, alloced; > > > + unsigned int interrupted = 0, successful = 0; > > > + > > > + si_meminfo(&si); > > > + total = si.freeram * si.mem_unit; > > > + > > > + kunit_info(test, "Free ram is %lu bytes. Will allocate > > > twice of that.\n", > > > + (unsigned long) total); > > > + > > > + total <<= 1; > > > + for (alloced = 0; alloced < total ; alloced += > > > XE_BO_SHRINK_SIZE) { > > > + struct xe_bo *bo; > > > + unsigned int mem_type; > > > + > > > + link = kzalloc(sizeof(*link), GFP_KERNEL); > > > + if (!link) { > > > + KUNIT_FAIL(test, "Unexpeced link > > > allocation failure\n"); > > > + break; > > > + } > > > + > > > + INIT_LIST_HEAD(&link->link); > > > + > > > + /* We can create bos using WC caching here. But it > > > is slower. */ > > > + bo = xe_bo_create_user(xe, NULL, NULL, > > > XE_BO_SHRINK_SIZE, > > > + DRM_XE_GEM_CPU_CACHING_WB, > > > + ttm_bo_type_device, > > > + XE_BO_FLAG_SYSTEM); > > > + if (IS_ERR(bo)) { > > > + if (bo != ERR_PTR(-ENOMEM) && bo != > > > ERR_PTR(-ENOSPC) && > > > + bo != ERR_PTR(-EINTR) && bo != > > > ERR_PTR(-ERESTARTSYS)) > > > + KUNIT_FAIL(test, "Error creating > > > bo: %pe\n", bo); > > > + kfree(link); > > > + break; > > > + } > > > + link->bo = bo; > > > + list_add_tail(&link->link, &bos); > > > + xe_bo_lock(bo, false); > > > + > > > + /* > > > + * If we're low on swap entries, we can't shrink > > > unless the bo > > > + * is marked purgeable. 
> > > + */ > > > + if (get_nr_swap_pages() < (XE_BO_SHRINK_SIZE >> > > > PAGE_SHIFT) * 128) { > > > + struct xe_ttm_tt *xe_tt = > > > + container_of(bo->ttm.ttm, > > > typeof(*xe_tt), ttm); > > > + long num_pages = xe_tt->ttm.num_pages; > > > + > > > + xe_tt->purgeable = true; > > > + xe_shrinker_mod_pages(xe->mem.shrinker, - > > > num_pages, > > > + num_pages); > > > + } > > > + > > > + mem_type = bo->ttm.resource->mem_type; > > > + xe_bo_unlock(bo); > > > + if (mem_type != XE_PL_TT) > > > + KUNIT_FAIL(test, "Bo in incorrect memory > > > type: %u\n", > > > + bo->ttm.resource->mem_type); > > > + cond_resched(); > > > + if (signal_pending(current)) > > > + break; > > > + } > > > + > > > + /* Read back and destroy bos */ > > > + list_for_each_entry_safe_reverse(link, next, &bos, link) { > > > + static struct ttm_operation_ctx ctx = > > > {.interruptible = true}; > > > + struct xe_bo *bo = link->bo; > > > + int ret; > > > + > > > + if (!signal_pending(current)) { > > > + xe_bo_lock(bo, NULL); > > > + ret = ttm_bo_validate(&bo->ttm, > > > &tt_placement, &ctx); > > > + xe_bo_unlock(bo); > > > + if (ret && ret != -EINTR) > > > + KUNIT_FAIL(test, "Validation > > > failed: %pe\n", > > > + ERR_PTR(ret)); > > > + else if (ret) > > > + interrupted++; > > > + else > > > + successful++; > > > + } > > > + xe_bo_put(link->bo); > > > + list_del(&link->link); > > > + kfree(link); > > > + cond_resched(); > > > + } > > > + kunit_info(test, "Readbacks interrupted: %u successful: > > > %u\n", > > > + interrupted, successful); > > > + > > > + return 0; > > > +} > > > + > > > +void xe_bo_shrink_kunit(struct kunit *test) > > > +{ > > > + xe_call_for_each_device(shrink_test_run_device); > > > +} > > > +EXPORT_SYMBOL_IF_KUNIT(xe_bo_shrink_kunit); > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > b/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > index a324cde77db8..317fa923e287 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo_test.c > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo_test.c 
> > > @@ -10,6 +10,7 @@ > > > static struct kunit_case xe_bo_tests[] = { > > > KUNIT_CASE(xe_ccs_migrate_kunit), > > > KUNIT_CASE(xe_bo_evict_kunit), > > > + KUNIT_CASE_SLOW(xe_bo_shrink_kunit), > > > {} > > > }; > > > > > > diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > b/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > index 0113ab45066a..7f44d14a45c5 100644 > > > --- a/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > +++ b/drivers/gpu/drm/xe/tests/xe_bo_test.h > > > @@ -10,5 +10,6 @@ struct kunit; > > > > > > void xe_ccs_migrate_kunit(struct kunit *test); > > > void xe_bo_evict_kunit(struct kunit *test); > > > +void xe_bo_shrink_kunit(struct kunit *test); > > > > > > #endif > > > diff --git a/drivers/gpu/drm/xe/xe_bo.c > > > b/drivers/gpu/drm/xe/xe_bo.c > > > index 65c696966e96..6ab63d1642ae 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo.c > > > +++ b/drivers/gpu/drm/xe/xe_bo.c > > > @@ -10,6 +10,7 @@ > > > #include <drm/drm_drv.h> > > > #include <drm/drm_gem_ttm_helper.h> > > > #include <drm/drm_managed.h> > > > +#include <drm/ttm/ttm_backup.h> > > > #include <drm/ttm/ttm_device.h> > > > #include <drm/ttm/ttm_placement.h> > > > #include <drm/ttm/ttm_tt.h> > > > @@ -25,6 +26,7 @@ > > > #include "xe_pm.h" > > > #include "xe_preempt_fence.h" > > > #include "xe_res_cursor.h" > > > +#include "xe_shrinker.h" > > > #include "xe_trace_bo.h" > > > #include "xe_ttm_stolen_mgr.h" > > > #include "xe_vm.h" > > > @@ -278,11 +280,15 @@ static void xe_evict_flags(struct > > > ttm_buffer_object *tbo, > > > } > > > } > > > > > > +/* struct xe_ttm_tt - Subclassed ttm_tt for xe */ > > > struct xe_ttm_tt { > > > struct ttm_tt ttm; > > > - struct device *dev; > > > + /** @xe - The xe device */ > > > + struct xe_device *xe; > > > struct sg_table sgt; > > > struct sg_table *sg; > > > + /** @purgeable - Whether the bo is purgeable (WONTNEED) */ > > > > So we need to add WONTNEED to our uAPI, right? > > Not strictly, but we have work ongoing to implement this. 
Bos that UMD > pool are typically WONTNEED and unless we mark them as such, shrinking > of these will not work unless we have swap-space available, and then > only with a completely unnecessary copy... > Makes sense. > > > > > + bool purgeable; > > > }; > > > > > > static int xe_tt_map_sg(struct ttm_tt *tt) > > > @@ -291,7 +297,8 @@ static int xe_tt_map_sg(struct ttm_tt *tt) > > > unsigned long num_pages = tt->num_pages; > > > int ret; > > > > > > - XE_WARN_ON(tt->page_flags & TTM_TT_FLAG_EXTERNAL); > > > + XE_WARN_ON((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & > > > TTM_TT_FLAG_EXTERNAL_MAPPABLE)); > > > > > > if (xe_tt->sg) > > > return 0; > > > @@ -299,13 +306,13 @@ static int xe_tt_map_sg(struct ttm_tt *tt) > > > ret = sg_alloc_table_from_pages_segment(&xe_tt->sgt, tt- > > > >pages, > > > num_pages, 0, > > > (u64)num_pages << > > > PAGE_SHIFT, > > > - > > > xe_sg_segment_size(xe_tt->dev), > > > + xe_sg_segment_size > > > (xe_tt->xe->drm.dev), > > > GFP_KERNEL); > > > if (ret) > > > return ret; > > > > > > xe_tt->sg = &xe_tt->sgt; > > > - ret = dma_map_sgtable(xe_tt->dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, > > > + ret = dma_map_sgtable(xe_tt->xe->drm.dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, > > > DMA_ATTR_SKIP_CPU_SYNC); > > > if (ret) { > > > sg_free_table(xe_tt->sg); > > > @@ -321,7 +328,7 @@ static void xe_tt_unmap_sg(struct ttm_tt *tt) > > > struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > > > > if (xe_tt->sg) { > > > - dma_unmap_sgtable(xe_tt->dev, xe_tt->sg, > > > + dma_unmap_sgtable(xe_tt->xe->drm.dev, xe_tt->sg, > > > DMA_BIDIRECTIONAL, 0); > > > sg_free_table(xe_tt->sg); > > > xe_tt->sg = NULL; > > > @@ -336,21 +343,41 @@ struct sg_table *xe_bo_sg(struct xe_bo *bo) > > > return xe_tt->sg; > > > } > > > > > > +/* > > > + * Account ttm pages against the device shrinker's shrinkable and > > > + * purgeable counts. 
> > > + */ > > > +static void xe_ttm_tt_account(struct ttm_tt *tt, bool add) > > > +{ > > > > Again I think bools are frowned upon as arguments. Maybe just two > > functions - add / sub? > > OK, > > > > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > + long num_pages = tt->num_pages; > > > + > > > + if (!add) > > > + num_pages = -num_pages; > > > + > > > + if (xe_tt->purgeable) > > > + xe_shrinker_mod_pages(xe_tt->xe->mem.shrinker, 0, > > > num_pages); > > > + else > > > + xe_shrinker_mod_pages(xe_tt->xe->mem.shrinker, > > > num_pages, 0); > > > +} > > > + > > > static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object > > > *ttm_bo, > > > u32 page_flags) > > > { > > > struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); > > > struct xe_device *xe = xe_bo_device(bo); > > > - struct xe_ttm_tt *tt; > > > + struct xe_ttm_tt *xe_tt; > > > + struct ttm_tt *tt; > > > unsigned long extra_pages; > > > enum ttm_caching caching; > > > int err; > > > > > > - tt = kzalloc(sizeof(*tt), GFP_KERNEL); > > > - if (!tt) > > > + xe_tt = kzalloc(sizeof(*xe_tt), GFP_KERNEL); > > > + if (!xe_tt) > > > return NULL; > > > > > > - tt->dev = xe->drm.dev; > > > + tt = &xe_tt->ttm; > > > + xe_tt->xe = xe; > > > > > > extra_pages = 0; > > > if (xe_bo_needs_ccs_pages(bo)) > > > @@ -387,42 +414,128 @@ static struct ttm_tt > > > *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, > > > caching = ttm_uncached; > > > } > > > > > > - err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, > > > extra_pages); > > > + if (ttm_bo->type != ttm_bo_type_sg) > > > + page_flags |= TTM_TT_FLAG_EXTERNAL | > > > TTM_TT_FLAG_EXTERNAL_MAPPABLE; > > > + > > > + err = ttm_tt_init(tt, &bo->ttm, page_flags, caching, > > > extra_pages); > > > if (err) { > > > - kfree(tt); > > > + kfree(xe_tt); > > > return NULL; > > > } > > > > > > - return &tt->ttm; > > > + tt->backup = ttm_backup_shmem_create(tt->num_pages << > > > PAGE_SHIFT); > > > + if (IS_ERR(tt->backup)) { > > > + 
ttm_tt_fini(tt); > > > > Mentioned this in the previous review I think you need to set tt->backup to > > NULL here or update ttm_tt_fini to understand IS_ERR(tt->backup). > > > > Also maybe dumb question but could we just have a global backup for > > all > > BOs? Would that be better than each BO creating its own backup? > > I initially made the code like that, when we had a global backend > directly into the swap cache. But I figure shmem wants one file per bo, > otherwise we'd need one gigantic shmem object... Not sure if that > works... > If one backup per shmem makes sense, then let's do that. I do think we should remove the restriction of only supporting 1 backup per BO so if a driver wants to use a global backup it can. e.g. Don't fini / free the backup in ttm_tt_fini rather have the driver completely own the backup field. Kinda goofy for the driver to setup the field but TTM to fini / free it. > > > > > + kfree(xe_tt); > > > + return NULL; > > > + } > > > + > > > + return tt; > > > } > > > > > > static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct > > > ttm_tt *tt, > > > struct ttm_operation_ctx *ctx) > > > { > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > int err; > > > > > > /* > > > * dma-bufs are not populated with pages, and the dma- > > > * addresses are set up when moved to XE_PL_TT. 
> > > */ > > > - if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) > > > + if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE)) > > > return 0; > > > > > > err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx); > > > if (err) > > > return err; > > > > > > - return err; > > > + xe_tt->purgeable = false; > > > + xe_ttm_tt_account(tt, true); > > > + > > > + return 0; > > > } > > > > > > static void xe_ttm_tt_unpopulate(struct ttm_device *ttm_dev, > > > struct ttm_tt *tt) > > > { > > > - if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) > > > + if ((tt->page_flags & TTM_TT_FLAG_EXTERNAL) && > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE)) > > > return; > > > > > > xe_tt_unmap_sg(tt); > > > > > > - return ttm_pool_free(&ttm_dev->pool, tt); > > > + ttm_pool_free(&ttm_dev->pool, tt); > > > + xe_ttm_tt_account(tt, false); > > > +} > > > + > > > +/** > > > + * xe_bo_shrink() - Try to shrink an xe bo. > > > + * @walk: - The walk parameters > > > + * @bo: The TTM buffer object > > > + * @purge: Only consider purgeable bos. > > > + * @writeback: Try to write back to persistent storage. > > > + * > > > + * Try to shrink- or purge a bo, and if it succeeds, unmap dma. > > > + * Note that we need to be able to handle also non xe bos > > > + * (ghost bos), but only if the struct ttm_tt is embedded in > > > + * a struct xe_ttm_tt. > > > + * > > > + * Return: The number of pages shrunken or purged, or negative > > > error > > > + * code on failure. 
> > > + */ > > > +long xe_bo_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback) > > > +{ > > > + struct ttm_tt *tt = bo->ttm; > > > + struct xe_ttm_tt *xe_tt = container_of(tt, struct > > > xe_ttm_tt, ttm); > > > + struct ttm_place place = {.mem_type = bo->resource- > > > >mem_type}; > > > + struct xe_bo *xe_bo = ttm_to_xe_bo(bo); > > > + struct xe_device *xe = xe_tt->xe; > > > + bool needs_rpm; > > > + long lret = 0L; > > > + > > > + if (!tt || !ttm_tt_is_populated(tt) || > > > + !(tt->page_flags & TTM_TT_FLAG_EXTERNAL_MAPPABLE) || > > > + (purge && !xe_tt->purgeable)) > > > + return 0L; > > > + > > > + if (!ttm_bo_eviction_valuable(bo, &place)) > > > + return 0L; > > > + > > > + /* Beware of zombies (GEM object refcount == 0) and > > > ghosts. */ > > > + if (!xe_bo_is_xe_bo(bo) || !xe_bo_get_unless_zero(xe_bo)) > > > { > > > + struct ttm_placement null_placement = { > > > .num_placement = 0 }; > > > + > > > + lret = ttm_bo_wait_ctx(bo, walk->ctx); > > > + if (lret) > > > + return lret; > > > + > > > + /* Purge the bo content! */ > > > + ttm_bo_validate(bo, &null_placement, walk->ctx); > > > + return tt->num_pages; > > > + } > > > + > > > + /* System CCS needs gpu copy when moving PL_TT -> > > > PL_SYSTEM */ > > > + needs_rpm = (!IS_DGFX(xe) && bo->resource->mem_type != > > > XE_PL_SYSTEM && > > > + xe_bo && xe_bo_needs_ccs_pages(xe_bo) && > > > !xe_tt->purgeable); > > > > Is xe_bo check really needed here? > > Yes, I think otherwise xe_bo_needs_ccs_pages will be called with NULL > for ghost objects that aren't xe_bo. > Got it. 
> > > > > + if (needs_rpm && !xe_pm_runtime_get_if_active(xe)) > > > + goto out_unref; > > > + > > > + lret = ttm_bo_try_shrink(walk, bo, xe_tt->purgeable, > > > writeback); > > > + if (needs_rpm) > > > + xe_pm_runtime_put(xe); > > > + > > > + if (lret > 0) { > > > + xe_assert(xe, !ttm_tt_is_populated(tt)); > > > + > > > + xe_ttm_tt_account(tt, false); > > > + } > > > + > > > +out_unref: > > > + xe_bo_put(xe_bo); > > > + > > > + return lret; > > > } > > > > > > static void xe_ttm_tt_destroy(struct ttm_device *ttm_dev, struct > > > ttm_tt *tt) > > > @@ -1238,6 +1351,7 @@ struct xe_bo *___xe_bo_create_locked(struct > > > xe_device *xe, struct xe_bo *bo, > > > struct ttm_operation_ctx ctx = { > > > .interruptible = true, > > > .no_wait_gpu = false, > > > + .gfp_retry_mayfail = true, > > > > Can you explain why you are setting this? > > Hm. We might want this in a separate patch. Without this, bo memory > allocation will typically never fail, but instead start the OOM killer. > I don't think that's the behaviour we want, but yeah deserves its own > patch and review. > Sounds good. 
> > > > > }; > > > struct ttm_placement *placement; > > > uint32_t alignment; > > > @@ -1681,6 +1795,8 @@ int xe_bo_pin_external(struct xe_bo *bo) > > > } > > > > > > ttm_bo_pin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, false); > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1739,6 +1855,8 @@ int xe_bo_pin(struct xe_bo *bo) > > > } > > > > > > ttm_bo_pin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, false); > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1773,6 +1891,9 @@ void xe_bo_unpin_external(struct xe_bo *bo) > > > spin_unlock(&xe->pinned.lock); > > > > > > ttm_bo_unpin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, true); > > > + > > > > Nit: Extra newline. > Will fix. 
> > > > > > > > /* > > > * FIXME: If we always use the reserve / unreserve > > > functions for locking > > > @@ -1801,6 +1922,8 @@ void xe_bo_unpin(struct xe_bo *bo) > > > } > > > > > > ttm_bo_unpin(&bo->ttm); > > > + if (bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) > > > + xe_ttm_tt_account(bo->ttm.ttm, true); > > > } > > > > > > /** > > > diff --git a/drivers/gpu/drm/xe/xe_bo.h > > > b/drivers/gpu/drm/xe/xe_bo.h > > > index 6de894c728f5..8463e3f3f6f1 100644 > > > --- a/drivers/gpu/drm/xe/xe_bo.h > > > +++ b/drivers/gpu/drm/xe/xe_bo.h > > > @@ -63,6 +63,7 @@ > > > #define XE_BO_PROPS_INVALID (-1) > > > > > > struct sg_table; > > > +struct xe_ttm_lru_walk; > > > > > > struct xe_bo *xe_bo_alloc(void); > > > void xe_bo_free(struct xe_bo *bo); > > > @@ -126,6 +127,28 @@ static inline struct xe_bo *xe_bo_get(struct > > > xe_bo *bo) > > > return bo; > > > } > > > > > > +/* > > > + * xe_bo_get_unless_zero() - Conditionally obtain a GEM object > > > refcount on an > > > + * xe bo > > > + * @bo: The bo for which we want to obtain a refcount. > > > + * > > > + * There is a short window between where the bo's GEM object > > > refcount reaches > > > + * zero and where we put the final ttm_bo reference. Code in the > > > eviction- and > > > + * shrinking path should therefore attempt to grab a gem object > > > reference before > > > + * trying to use members outside of the base class ttm object. > > > This function is > > > + * intended for that purpose. On successful return, this function > > > must be paired > > > + * with an xe_bo_put(). > > > + * > > > + * Return: @bo on success, NULL on failure. 
> > > + */ > > > +static inline __must_check struct xe_bo > > > *xe_bo_get_unless_zero(struct xe_bo *bo) > > > +{ > > > + if (!bo || !kref_get_unless_zero(&bo->ttm.base.refcount)) > > > + return NULL; > > > + > > > + return bo; > > > +} > > > + > > > static inline void xe_bo_put(struct xe_bo *bo) > > > { > > > if (bo) > > > @@ -315,6 +338,9 @@ static inline unsigned int > > > xe_sg_segment_size(struct device *dev) > > > > > > #define > > > i915_gem_object_flush_if_display(obj) ((void)(obj)) > > > > > > +long xe_bo_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback); > > > + > > > #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) > > > /** > > > * xe_bo_is_mem_type - Whether the bo currently resides in the > > > given > > > diff --git a/drivers/gpu/drm/xe/xe_device.c > > > b/drivers/gpu/drm/xe/xe_device.c > > > index cfda7cb5df2c..58fecc4b0a18 100644 > > > --- a/drivers/gpu/drm/xe/xe_device.c > > > +++ b/drivers/gpu/drm/xe/xe_device.c > > > @@ -47,6 +47,7 @@ > > > #include "xe_perf.h" > > > #include "xe_pm.h" > > > #include "xe_query.h" > > > +#include "xe_shrinker.h" > > > #include "xe_sriov.h" > > > #include "xe_tile.h" > > > #include "xe_ttm_stolen_mgr.h" > > > @@ -241,6 +242,9 @@ static void xe_device_destroy(struct drm_device > > > *dev, void *dummy) > > > if (xe->unordered_wq) > > > destroy_workqueue(xe->unordered_wq); > > > > > > + if (!IS_ERR_OR_NULL(xe->mem.shrinker)) > > > + xe_shrinker_destroy(xe->mem.shrinker); > > > + > > > ttm_device_fini(&xe->ttm); > > > } > > > > > > @@ -270,6 +274,10 @@ struct xe_device *xe_device_create(struct > > > pci_dev *pdev, > > > if (err) > > > goto err; > > > > > > + xe->mem.shrinker = xe_shrinker_create(xe); > > > + if (IS_ERR(xe->mem.shrinker)) > > > + return ERR_CAST(xe->mem.shrinker); > > > + > > > xe->info.devid = pdev->device; > > > xe->info.revid = pdev->revision; > > > xe->info.force_execlist = xe_modparam.force_execlist; > > > diff --git 
a/drivers/gpu/drm/xe/xe_device_types.h > > > b/drivers/gpu/drm/xe/xe_device_types.h > > > index c37be471d11c..3d5440aba52e 100644 > > > --- a/drivers/gpu/drm/xe/xe_device_types.h > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h > > > @@ -325,6 +325,8 @@ struct xe_device { > > > struct xe_mem_region vram; > > > /** @mem.sys_mgr: system TTM manager */ > > > struct ttm_resource_manager sys_mgr; > > > + /** @mem.sys_mgr: system memory shrinker. */ > > > + struct xe_shrinker *shrinker; > > > } mem; > > > > > > /** @sriov: device level virtualization data */ > > > diff --git a/drivers/gpu/drm/xe/xe_shrinker.c > > > b/drivers/gpu/drm/xe/xe_shrinker.c > > > new file mode 100644 > > > index 000000000000..3f9554bdc06b > > > --- /dev/null > > > +++ b/drivers/gpu/drm/xe/xe_shrinker.c > > > @@ -0,0 +1,287 @@ > > > +// SPDX-License-Identifier: MIT > > > +/* > > > + * Copyright © 2024 Intel Corporation > > > + */ > > > + > > > +#include <linux/shrinker.h> > > > +#include <linux/swap.h> > > > + > > > +#include <drm/ttm/ttm_bo.h> > > > +#include <drm/ttm/ttm_tt.h> > > > + > > > +#include "xe_bo.h" > > > +#include "xe_pm.h" > > > +#include "xe_shrinker.h" > > > + > > > +/** > > > + * struct xe_shrinker - per-device shrinker > > > + * @xe: Back pointer to the device. > > > + * @lock: Lock protecting accounting. > > > + * @shrinkable_pages: Number of pages that are currently > > > shrinkable. > > > + * @purgeable_pages: Number of pages that are currently purgeable. > > > + * @shrink: Pointer to the mm shrinker. > > > + * @pm_worker: Worker to wake up the device if required. > > > + */ > > > +struct xe_shrinker { > > > + struct xe_device *xe; > > > + rwlock_t lock; > > > + long shrinkable_pages; > > > + long purgeable_pages; > > > + struct shrinker *shrink; > > > + struct work_struct pm_worker; > > > +}; > > > + > > > +/** > > > + * struct xe_shrink_lru_walk - lru_walk subclass for shrinker > > > + * @walk: The embedded base class. > > > + * @xe: Pointer to the xe device. 
> > > + * @purge: Purgeable only request from the srinker. > > > + * @writeback: Try to write back to persistent storage. > > > + */ > > > +struct xe_shrink_lru_walk { > > > + struct ttm_lru_walk walk; > > > + struct xe_device *xe; > > > + bool purge; > > > + bool writeback; > > > +}; > > > + > > > +static struct xe_shrinker *to_xe_shrinker(struct shrinker *shrink) > > > +{ > > > + return shrink->private_data; > > > +} > > > + > > > +static struct xe_shrink_lru_walk * > > > +to_xe_shrink_lru_walk(struct ttm_lru_walk *walk) > > > +{ > > > + return container_of(walk, struct xe_shrink_lru_walk, > > > walk); > > > +} > > > + > > > +/** > > > + * xe_shrinker_mod_pages() - Modify shrinker page accounting > > > + * @shrinker: Pointer to the struct xe_shrinker. > > > + * @shrinkable: Shrinkable pages delta. May be negative. > > > + * @purgeable: Purgeable page delta. May be negative. > > > + * > > > + * Modifies the shrinkable and purgeable pages accounting. > > > + */ > > > +void > > > +xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long > > > shrinkable, long purgeable) > > > +{ > > > + write_lock(&shrinker->lock); > > > + shrinker->shrinkable_pages += shrinkable; > > > + shrinker->purgeable_pages += purgeable; > > > + write_unlock(&shrinker->lock); > > > +} > > > + > > > +static long xe_shrinker_process_bo(struct ttm_lru_walk *walk, > > > struct ttm_buffer_object *bo) > > > +{ > > > + struct xe_shrink_lru_walk *shrink_walk = > > > to_xe_shrink_lru_walk(walk); > > > + > > > + return xe_bo_shrink(walk, bo, shrink_walk->purge, > > > shrink_walk->writeback); > > > +} > > > + > > > +static long xe_shrinker_walk(struct xe_shrink_lru_walk > > > *shrink_walk, long target) > > > +{ > > > + struct xe_device *xe = shrink_walk->xe; > > > + struct ttm_resource_manager *man; > > > + unsigned int mem_type; > > > + long sofar = 0; > > > + long lret; > > > + > > > + for (mem_type = XE_PL_SYSTEM; mem_type <= XE_PL_TT; > > > ++mem_type) { > > > + man = ttm_manager_type(&xe->ttm, 
mem_type); > > > + if (!man || !man->use_tt) > > > + continue; > > > + > > > + lret = ttm_lru_walk_for_evict(&shrink_walk->walk, > > > &xe->ttm, man, target); > > > + if (lret < 0) > > > + return lret; > > > + > > > + sofar += lret; > > > + if (sofar >= target) > > > + break; > > > + } > > > + > > > + return sofar; > > > +} > > > + > > > +static unsigned long > > > +xe_shrinker_count(struct shrinker *shrink, struct shrink_control > > > *sc) > > > +{ > > > + struct xe_shrinker *shrinker = to_xe_shrinker(shrink); > > > + unsigned long num_pages; > > > + > > > + num_pages = get_nr_swap_pages(); > > > + read_lock(&shrinker->lock); > > > + num_pages = min_t(unsigned long, num_pages, shrinker- > > > >shrinkable_pages); > > > + num_pages += shrinker->purgeable_pages; > > > + read_unlock(&shrinker->lock); > > > + > > > + return num_pages ? num_pages : SHRINK_EMPTY; > > > +} > > > + > > > +static const struct ttm_lru_walk_ops xe_shrink_ops = { > > > + .process_bo = xe_shrinker_process_bo, > > > +}; > > > + > > > +/* > > > + * Check if we need runtime pm, and if so try to grab a reference > > > if > > > + * already active. If grabbing a reference fails, queue a worker > > > that > > > + * does it for us outside of reclaim, but don't wait for it to > > > complete. > > > + * If bo shrinking needs an rpm reference and we don't have it > > > (yet), > > > + * that bo will be skipped anyway. 
> > > + */ > > > +static bool xe_shrinker_runtime_pm_get(struct xe_shrinker > > > *shrinker, bool force, > > > + unsigned long nr_to_scan) > > > +{ > > > + struct xe_device *xe = shrinker->xe; > > > + > > > + if (IS_DGFX(xe) || !xe_device_has_flat_ccs(xe) || > > > + !get_nr_swap_pages()) > > > + return false; > > > + > > > + if (!force) { > > > + read_lock(&shrinker->lock); > > > + force = (nr_to_scan > shrinker->purgeable_pages); > > > + read_unlock(&shrinker->lock); > > > + if (!force) > > > + return false; > > > + } > > > + > > > + if (!xe_pm_runtime_get_if_active(xe)) { > > > + queue_work(xe->unordered_wq, &shrinker- > > > >pm_worker); > > > + return false; > > > + } > > > + > > > + return true; > > > +} > > > + > > > +static void xe_shrinker_runtime_pm_put(struct xe_shrinker > > > *shrinker, bool runtime_pm) > > > +{ > > > + if (runtime_pm) > > > + xe_pm_runtime_put(shrinker->xe); > > > +} > > > + > > > +static unsigned long xe_shrinker_scan(struct shrinker *shrink, > > > struct shrink_control *sc) > > > +{ > > > + struct xe_shrinker *shrinker = to_xe_shrinker(shrink); > > > + bool is_kswapd = current_is_kswapd(); > > > + struct ttm_operation_ctx ctx = { > > > + .interruptible = false, > > > + .no_wait_gpu = !is_kswapd, > > > + }; > > > + unsigned long nr_to_scan, freed = 0; > > > + struct xe_shrink_lru_walk shrink_walk = { > > > + .walk = { > > > + .ops = &xe_shrink_ops, > > > + .ctx = &ctx, > > > + .trylock_only = true, > > > + }, > > > + .xe = shrinker->xe, > > > + .purge = true, > > > + .writeback = is_kswapd, > > > + }; > > > + bool runtime_pm; > > > + bool purgeable; > > > + long ret; > > > + > > > + sc->nr_scanned = 0; > > > + nr_to_scan = sc->nr_to_scan; > > > + > > > + read_lock(&shrinker->lock); > > > + purgeable = !!shrinker->purgeable_pages; > > > + read_unlock(&shrinker->lock); > > > + > > > + /* Might need runtime PM. Try to wake early if it looks > > > like it. 
*/ > > > + runtime_pm = xe_shrinker_runtime_pm_get(shrinker, false, > > > nr_to_scan); > > > + > > > + while (purgeable && freed < nr_to_scan) { > > > + ret = xe_shrinker_walk(&shrink_walk, nr_to_scan); > > > + if (ret <= 0) > > > + break; > > > + > > > + freed += ret; > > > + } > > > + > > > + sc->nr_scanned = freed; > > > + if (freed < nr_to_scan) > > > + nr_to_scan -= freed; > > > + else > > > + nr_to_scan = 0; > > > + if (!nr_to_scan) > > > + goto out; > > > + > > > + /* If we didn't wake before, try to do it now if needed. > > > */ > > > + if (!runtime_pm) > > > + runtime_pm = xe_shrinker_runtime_pm_get(shrinker, > > > true, 0); > > > + > > > + shrink_walk.purge = false; > > > + nr_to_scan = sc->nr_to_scan; > > > + while (freed < nr_to_scan) { > > > + ret = xe_shrinker_walk(&shrink_walk, nr_to_scan); > > > + if (ret <= 0) > > > + break; > > > + > > > + freed += ret; > > > + } > > > + > > > + sc->nr_scanned = freed; > > > + > > > +out: > > > + xe_shrinker_runtime_pm_put(shrinker, runtime_pm); > > > + return freed ? freed : SHRINK_STOP; > > > +} > > > + > > > +/* Wake up the device for shrinking. */ > > > +static void xe_shrinker_pm(struct work_struct *work) > > > +{ > > > + struct xe_shrinker *shrinker = > > > + container_of(work, typeof(*shrinker), pm_worker); > > > + > > > + xe_pm_runtime_get(shrinker->xe); > > > + xe_pm_runtime_put(shrinker->xe); > > > > So I don't really get this. How does this help the shrinker get a PM > > ref? The small window between xe_pm_runtime_get / put the shrinker > > grabs > > one via xe_pm_runtime_get_if_active? Seems fragile. > > The pm has a delay after the last put (Typically 1s?), and if the > shrinker fails obtaining a reference it will be retried. Typically it > will then succeed and when it does start a new delay. > Ah, forgot that there is a delay on the final put. This makes sense now. Can't really think of a better solution than this at the moment. Maybe add a comment indicating this? 
Matt > This doesn't strictly guarantee shrinking progress, but there's no way > we can synchronously delay shrinking until we have the pm reference. > > What we could perhaps do is to report back that we actually made > progress but postpone the actual shrinking to the worker with writeback > set to on; hoping that the shrinking core doesn't give up before it > sees the released memory... > > But I'd prefer to leave that out until we see that that's actually a > real problem. > > Best would be if we could have access the LNL CCS memory using the > CPU... > > /Thomas > > > > > > Matt > > > > > +} > > > + > > > +/** > > > + * xe_shrinker_create() - Create an xe per-device shrinker > > > + * @xe: Pointer to the xe device. > > > + * > > > + * Returns: A pointer to the created shrinker on success, > > > + * Negative error code on failure. > > > + */ > > > +struct xe_shrinker *xe_shrinker_create(struct xe_device *xe) > > > +{ > > > + struct xe_shrinker *shrinker = kzalloc(sizeof(*shrinker), > > > GFP_KERNEL); > > > + > > > + if (!shrinker) > > > + return ERR_PTR(-ENOMEM); > > > + > > > + shrinker->shrink = shrinker_alloc(0, "xe system > > > shrinker"); > > > + if (!shrinker->shrink) { > > > + kfree(shrinker); > > > + return ERR_PTR(-ENOMEM); > > > + } > > > + > > > + INIT_WORK(&shrinker->pm_worker, xe_shrinker_pm); > > > + shrinker->xe = xe; > > > + rwlock_init(&shrinker->lock); > > > + shrinker->shrink->count_objects = xe_shrinker_count; > > > + shrinker->shrink->scan_objects = xe_shrinker_scan; > > > + shrinker->shrink->private_data = shrinker; > > > + shrinker_register(shrinker->shrink); > > > + > > > + return shrinker; > > > +} > > > + > > > +/** > > > + * xe_shrinker_destroy() - Destroy an xe per-device shrinker > > > + * @shrinker: Pointer to the shrinker to destroy. 
> > > + */ > > > +void xe_shrinker_destroy(struct xe_shrinker *shrinker) > > > +{ > > > + xe_assert(shrinker->xe, !shrinker->shrinkable_pages); > > > + xe_assert(shrinker->xe, !shrinker->purgeable_pages); > > > + shrinker_free(shrinker->shrink); > > > + flush_work(&shrinker->pm_worker); > > > + kfree(shrinker); > > > +} > > > diff --git a/drivers/gpu/drm/xe/xe_shrinker.h > > > b/drivers/gpu/drm/xe/xe_shrinker.h > > > new file mode 100644 > > > index 000000000000..28a038f4fcbf > > > --- /dev/null > > > +++ b/drivers/gpu/drm/xe/xe_shrinker.h > > > @@ -0,0 +1,18 @@ > > > +/* SPDX-License-Identifier: MIT */ > > > +/* > > > + * Copyright © 2024 Intel Corporation > > > + */ > > > + > > > +#ifndef _XE_SHRINKER_H_ > > > +#define _XE_SHRINKER_H_ > > > + > > > +struct xe_shrinker; > > > +struct xe_device; > > > + > > > +void xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long > > > shrinkable, long purgeable); > > > + > > > +struct xe_shrinker *xe_shrinker_create(struct xe_device *xe); > > > + > > > +void xe_shrinker_destroy(struct xe_shrinker *shrinker); > > > + > > > +#endif > > > diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h > > > index e577528f5dfc..c7e81ae025d9 100644 > > > --- a/include/drm/ttm/ttm_bo.h > > > +++ b/include/drm/ttm/ttm_bo.h > > > @@ -229,6 +229,9 @@ struct ttm_lru_walk { > > > long ttm_lru_walk_for_evict(struct ttm_lru_walk *walk, struct > > > ttm_device *bdev, > > > struct ttm_resource_manager *man, long > > > target); > > > > > > +long ttm_bo_try_shrink(struct ttm_lru_walk *walk, struct > > > ttm_buffer_object *bo, > > > + bool purge, bool writeback); > > > + > > > /** > > > * ttm_bo_get - reference a struct ttm_buffer_object > > > * > > > -- > > > 2.44.0 > > > >