On LNL, because of flat CCS, the driver will create a migrate job to clear CCS metadata. Extend that to also clear pages using the GPU, with a new TTM pool flag which allows offloading page-clear activity to the GPU. This gives a very nice improvement for large buffers: Without the patch: ~/igt-gpu-tools/build/tests/xe_exec_store --run basic-store-benchmark IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64) Using IGT_SRANDOM=1718889799 for randomisation Opened device: /dev/dri/card0 Starting subtest: basic-store-benchmark Starting dynamic subtest: WC Dynamic subtest WC: SUCCESS (0.000s) Time taken for size SZ_4K: 4882 us Time taken for size SZ_2M: 3679 us Time taken for size SZ_64M: 13367 us Time taken for size SZ_128M: 21034 us Time taken for size SZ_256M: 32940 us Time taken for size SZ_1G: 116261 us Starting dynamic subtest: WB Dynamic subtest WB: SUCCESS (0.000s) Time taken for size SZ_4K: 5417 us Time taken for size SZ_2M: 5711 us Time taken for size SZ_64M: 15718 us Time taken for size SZ_128M: 26170 us Time taken for size SZ_256M: 50529 us Time taken for size SZ_1G: 177933 us Subtest basic-store-benchmark: SUCCESS (0.504s) With the patch: sudo ~/igt-gpu-tools/build/tests/xe_exec_store --run basic-store-benchmark IGT-Version: 1.28-g2ed908c0b (x86_64) (Linux: 6.9.0-xe+ x86_64) Using IGT_SRANDOM=1718889593 for randomisation Opened device: /dev/dri/card0 Starting subtest: basic-store-benchmark Starting dynamic subtest: WC Dynamic subtest WC: SUCCESS (0.000s) Time taken for size SZ_4K: 4479 us Time taken for size SZ_2M: 3291 us Time taken for size SZ_64M: 6595 us Time taken for size SZ_128M: 9069 us Time taken for size SZ_256M: 12681 us Time taken for size SZ_1G: 41806 us Starting dynamic subtest: WB Dynamic subtest WB: SUCCESS (0.000s) Time taken for size SZ_4K: 3317 us Time taken for size SZ_2M: 6458 us Time taken for size SZ_64M: 12802 us Time taken for size SZ_128M: 19579 us Time taken for size SZ_256M: 38768 us Time taken for size SZ_1G: 143250 us Subtest
basic-store-benchmark: SUCCESS (0.328s) Cc: Christian Koenig <christian.koenig@xxxxxxx> Cc: "Thomas Hellström" <thomas.hellstrom@xxxxxxxxxxxxxxx> Cc: Matthew Auld <matthew.auld@xxxxxxxxx> Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxxxx> --- drivers/gpu/drm/xe/xe_bo.c | 4 ++++ drivers/gpu/drm/xe/xe_device.c | 36 +++++++++++++++++++++------- drivers/gpu/drm/xe/xe_device_types.h | 2 ++ drivers/gpu/drm/xe/xe_migrate.c | 6 ++--- 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 65c696966e96..10ec02412dc4 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -387,6 +387,10 @@ static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, caching = ttm_uncached; } + /* Clear TTM_TT_FLAG_ZERO_ALLOC when GPU is set to clear pages */ + if (xe->mem.gpu_page_clear) + page_flags &= ~TTM_TT_FLAG_ZERO_ALLOC; + err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages); if (err) { kfree(tt); diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 75d4c8ae9234..8e8d54c59aae 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -240,8 +240,6 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy) if (xe->unordered_wq) destroy_workqueue(xe->unordered_wq); - - ttm_device_fini(&xe->ttm); } struct xe_device *xe_device_create(struct pci_dev *pdev, @@ -260,12 +258,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev, if (IS_ERR(xe)) return xe; - err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev, - xe->drm.anon_inode->i_mapping, - xe->drm.vma_offset_manager, false, false); - if (WARN_ON(err)) - goto err; - err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy, NULL); if (err) goto err; @@ -543,6 +535,13 @@ static int xe_device_set_has_flat_ccs(struct xe_device *xe) return xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); } +static void xe_device_destroy_ttm_device(struct drm_device 
*dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + ttm_device_fini(&xe->ttm); +} + int xe_device_probe(struct xe_device *xe) { struct xe_tile *tile; @@ -550,6 +549,7 @@ int xe_device_probe(struct xe_device *xe) int err; u8 last_gt; u8 id; + unsigned int ttm_pool_flags = 0; xe_pat_init_early(xe); @@ -572,6 +572,26 @@ int xe_device_probe(struct xe_device *xe) xe_ttm_sys_mgr_init(xe); + /* On iGFX device with flat CCS we clear CCS metadata, let's extend that + * and use GPU to clear pages as well. + */ + if (xe_device_has_flat_ccs(xe) && !IS_DGFX(xe)) { + ttm_pool_flags = TTM_POOL_FLAG_SKIP_CLEAR_ON_FREE; + xe->mem.gpu_page_clear = true; + } + + err = ttm_device_init_with_pool_flags(&xe->ttm, &xe_ttm_funcs, + xe->drm.dev, + xe->drm.anon_inode->i_mapping, + xe->drm.vma_offset_manager, + false, false, ttm_pool_flags); + if (WARN_ON(err)) + return err; + + err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy_ttm_device, NULL); + if (err) + return err; + for_each_gt(gt, xe, id) { err = xe_gt_init_early(gt); if (err) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index c37be471d11c..ece68c6f3668 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -325,6 +325,8 @@ struct xe_device { struct xe_mem_region vram; /** @mem.sys_mgr: system TTM manager */ struct ttm_resource_manager sys_mgr; + /** @gpu_page_clear: clear pages offloaded to GPU */ + bool gpu_page_clear; } mem; /** @sriov: device level virtualization data */ diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 05f933787860..0023f32d147d 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1003,6 +1003,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, struct xe_gt *gt = m->tile->primary_gt; struct xe_device *xe = gt_to_xe(gt); bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? 
true : false; + bool clear_on_create = xe->mem.gpu_page_clear; struct dma_fence *fence = NULL; u64 size = bo->size; struct xe_res_cursor src_it; @@ -1022,7 +1023,6 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, struct xe_sched_job *job; struct xe_bb *bb; u32 batch_size, update_idx; - bool usm = xe->info.has_usm; u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; @@ -1032,7 +1032,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, batch_size = 2 + pte_update_size(m, clear_vram, src, &src_it, &clear_L0, &clear_L0_ofs, &clear_L0_pt, - clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0, + !clear_on_create ? 0 : emit_clear_cmd_len(gt), 0, avail_pts); if (xe_device_has_flat_ccs(xe)) @@ -1060,7 +1060,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, bb->cs[bb->len++] = MI_BATCH_BUFFER_END; update_idx = bb->len; - if (!clear_system_ccs) + if (clear_on_create) emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram); if (xe_device_has_flat_ccs(xe)) { -- 2.42.0