Currently XE lacks clean-on-free implementation so using TTM_TT_FLAG_CLEARED_ON_FREE is invalid. Remove usage of TTM_TT_FLAG_CLEARED_ON_FREE and limit gpu system page clearing only for WB cached BOs which are not pooled so there is no need to return a zeroed pages to a pool. Without the patch: api_overhead_benchmark_l0 --testFilter=UsmMemoryAllocation: UsmMemoryAllocation(api=l0 type=Host size=4KB) 79.439 us UsmMemoryAllocation(api=l0 type=Host size=1GB) 98677.75 us Perf tool top 5 entries: 11.16% api_overhead_be [kernel.kallsyms] [k] __pageblock_pfn_to_page 7.85% api_overhead_be [kernel.kallsyms] [k] cpa_flush 7.59% api_overhead_be [kernel.kallsyms] [k] find_next_iomem_res 7.24% api_overhead_be [kernel.kallsyms] [k] pages_are_mergeable 5.53% api_overhead_be [kernel.kallsyms] [k] lookup_address_in_pgd_attr With the patch: UsmMemoryAllocation(api=l0 type=Host size=4KB) 78.164 us UsmMemoryAllocation(api=l0 type=Host size=1GB) 98880.39 us Perf tool top 5 entries: 25.40% api_overhead_be [kernel.kallsyms] [k] clear_page_erms 9.89% api_overhead_be [kernel.kallsyms] [k] pages_are_mergeable 4.64% api_overhead_be [kernel.kallsyms] [k] cpa_flush 4.04% api_overhead_be [kernel.kallsyms] [k] find_next_iomem_res 3.96% api_overhead_be [kernel.kallsyms] [k] mod_find This is still better than the base case where there was no page clearing offloading. Cc: Christian König <christian.koenig@xxxxxxx> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@xxxxxxxxx> Cc: Matthew Auld <matthew.auld@xxxxxxxxx> Cc: Matthew Brost <matthew.brost@xxxxxxxxx> Cc: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxxxx> --- drivers/gpu/drm/xe/xe_bo.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6ed0e1955215..a18408d5d185 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -283,6 +283,7 @@ struct xe_ttm_tt { struct device *dev; struct sg_table sgt; struct sg_table *sg; + bool clear_system_pages; }; static int xe_tt_map_sg(struct ttm_tt *tt) @@ -397,12 +398,17 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, } /* - * If the device can support gpu clear system pages then set proper ttm + * If the device can support gpu clear system pages then set proper * flag. Zeroed pages are only required for ttm_bo_type_device so * unwanted data is not leaked to userspace. + * + * XE currently does clear-on-alloc so gpu clear will only work on + * non-pooled BO, DRM_XE_GEM_CPU_CACHING_WB otherwise global pool will + * get poisoned ono-zeroed pages. */ - if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys) - page_flags |= TTM_TT_FLAG_CLEARED_ON_FREE; + if (ttm_bo->type == ttm_bo_type_device && xe->mem.gpu_page_clear_sys && + bo->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB) + tt->clear_system_pages = true; err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages); if (err) { @@ -416,8 +422,11 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt, struct ttm_operation_ctx *ctx) { + struct xe_ttm_tt *xe_tt; int err; + xe_tt = container_of(tt, struct xe_ttm_tt, ttm); + /* * dma-bufs are not populated with pages, and the dma- * addresses are set up when moved to XE_PL_TT. @@ -426,7 +435,7 @@ static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt, return 0; /* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear system pages */ - if (tt->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE) + if (xe_tt->clear_system_pages) tt->page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC; err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx); @@ -664,6 +673,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, struct ttm_resource *old_mem = ttm_bo->resource; u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM; struct ttm_tt *ttm = ttm_bo->ttm; + struct xe_ttm_tt *xe_tt = container_of(ttm, struct xe_ttm_tt, ttm); struct xe_migrate *migrate = NULL; struct dma_fence *fence; bool move_lacks_source; @@ -671,15 +681,16 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, bool needs_clear; bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) && ttm && ttm_tt_is_populated(ttm)) ? true : false; - bool clear_system_pages; + bool clear_system_pages = false; int ret = 0; /* * Clear TTM_TT_FLAG_CLEARED_ON_FREE on bo creation path when * moving to system as the bo doesn't have dma_mapping. */ - if (!old_mem && ttm && !ttm_tt_is_populated(ttm)) - ttm->page_flags &= ~TTM_TT_FLAG_CLEARED_ON_FREE; + if (!old_mem && ttm && !ttm_tt_is_populated(ttm) && + xe_tt->clear_system_pages) + xe_tt->clear_system_pages = false; /* Bo creation path, moving to system or TT. */ if ((!old_mem && ttm) && !handle_system_ccs) { @@ -703,7 +714,7 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared) : (!mem_type_is_vram(old_mem_type) && !tt_has_data); - clear_system_pages = ttm && (ttm->page_flags & TTM_TT_FLAG_CLEARED_ON_FREE); + clear_system_pages = ttm && xe_tt->clear_system_pages; needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) || (!ttm && ttm_bo->type == ttm_bo_type_device) || clear_system_pages; -- 2.42.0