When memory is allocated using
__GFP_THISNODE flag, memory allocations will predominantly be done on
the local node, consequency, the shrinkers may priotitize reclaiming
memory from caches assocoated with local node to maintain memory
locality and minimize latency, thereby provide better shinker targeting.
Reduced memory pressure on remote nodes, can also indirectly influence
shrinker behavior by potentially reducing the frequency and intensity of
memory reclamation operation on remote nodes and could provide improved
overall system performance.
While this change could be more beneficial in general, i.e., without the
use of a module parameter, but in absence of widespread testing, limit
it to the AMD GFXIP 9.4.3 based ttm pool initializations only.
Cc: Joe Greathouse <joseph.greathouse@xxxxxxx>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 ++++++-
drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 10 +++++-----
drivers/gpu/drm/ttm/ttm_device.c | 2 +-
drivers/gpu/drm/ttm/ttm_pool.c | 7 ++++++-
include/drm/ttm/ttm_pool.h | 4 +++-
7 files changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c62552bec34..96532cfc6230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -253,6 +253,7 @@ extern int amdgpu_user_partt_mode;
extern int amdgpu_agp;
extern int amdgpu_wbrf;
+extern bool strict_numa_alloc;
#define AMDGPU_VM_MAX_NUM_CTX 4096
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 80b9642f2bc4..a183a6b4493d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -781,6 +781,14 @@ int queue_preemption_timeout_ms = 9000;
module_param(queue_preemption_timeout_ms, int, 0644);
MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption
timeout in ms (1 = Minimum, 9000 = default)");
+/**
+ * DOC: strict_numa_alloc(bool)
+ * Policy to force NUMA allocation requests from the proximity NUMA
domain only.
+ */
+bool strict_numa_alloc;
+module_param(strict_numa_alloc, bool, 0444);
+MODULE_PARM_DESC(strict_numa_alloc, "Force NUMA allocation requests
to be satisfied from the closest node only (false = default)");
+
/**
* DOC: debug_evictions(bool)
* Enable extra debug messages to help determine the cause of
evictions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index b0ed10f4de60..a9f78f85e28c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1768,6 +1768,7 @@ static int amdgpu_ttm_reserve_tmr(struct
amdgpu_device *adev)
static int amdgpu_ttm_pools_init(struct amdgpu_device *adev)
{
+ bool policy = true;
int i;
if (!adev->gmc.is_app_apu || !adev->gmc.num_mem_partitions)
@@ -1779,11 +1780,15 @@ static int amdgpu_ttm_pools_init(struct
amdgpu_device *adev)
if (!adev->mman.ttm_pools)
return -ENOMEM;
+ /* Policy not only depends on the module param but also on the
ASIC
+ * setting use_strict_numa_alloc as well.
+ */
for (i = 0; i < adev->gmc.num_mem_partitions; i++) {
ttm_pool_init(&adev->mman.ttm_pools[i], adev->dev,
adev->gmc.mem_partitions[i].numa.node,
- false, false);
+ false, false, policy && strict_numa_alloc);
}
+
return 0;
}
diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
index 2d9cae8cd984..6ff47aac570a 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
@@ -87,7 +87,7 @@ static struct ttm_pool
*ttm_pool_pre_populated(struct kunit *test,
pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test, pool);
- ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
+ ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
err = ttm_pool_alloc(pool, tt, &simple_ctx);
KUNIT_ASSERT_EQ(test, err, 0);
@@ -152,7 +152,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
KUNIT_ASSERT_NOT_NULL(test, pool);
ttm_pool_init(pool, devs->dev, NUMA_NO_NODE,
params->use_dma_alloc,
- false);
+ false, false);
KUNIT_ASSERT_PTR_EQ(test, pool->dev, devs->dev);
KUNIT_ASSERT_EQ(test, pool->nid, NUMA_NO_NODE);
@@ -219,7 +219,7 @@ static void ttm_pool_alloc_basic_dma_addr(struct
kunit *test)
pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test, pool);
- ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
+ ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
err = ttm_pool_alloc(pool, tt, &simple_ctx);
KUNIT_ASSERT_EQ(test, err, 0);
@@ -349,7 +349,7 @@ static void ttm_pool_free_dma_alloc(struct kunit
*test)
pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test, pool);
- ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false);
+ ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, true, false, false);
ttm_pool_alloc(pool, tt, &simple_ctx);
pt = &pool->caching[caching].orders[order];
@@ -380,7 +380,7 @@ static void ttm_pool_free_no_dma_alloc(struct
kunit *test)
pool = kunit_kzalloc(test, sizeof(*pool), GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test, pool);
- ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false);
+ ttm_pool_init(pool, devs->dev, NUMA_NO_NODE, false, false, false);
ttm_pool_alloc(pool, tt, &simple_ctx);
pt = &pool->caching[caching].orders[order];
diff --git a/drivers/gpu/drm/ttm/ttm_device.c
b/drivers/gpu/drm/ttm/ttm_device.c
index f5187b384ae9..540e8a44f015 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -215,7 +215,7 @@ int ttm_device_init(struct ttm_device *bdev,
const struct ttm_device_funcs *func
ttm_sys_man_init(bdev);
- ttm_pool_init(&bdev->pool, dev, dev_to_node(dev),
use_dma_alloc, use_dma32);
+ ttm_pool_init(&bdev->pool, dev, dev_to_node(dev), use_dma_alloc,
use_dma32, false);
bdev->vma_manager = vma_manager;
spin_lock_init(&bdev->lru_lock);
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c
b/drivers/gpu/drm/ttm/ttm_pool.c
index dbc96984d331..73aafd06c361 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -447,6 +447,9 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct
ttm_tt *tt,
else
gfp_flags |= GFP_HIGHUSER;
+ if (pool->use_strict_numa_alloc)
+ gfp_flags |= __GFP_THISNODE;
+
for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
num_pages;
order = min_t(unsigned int, order, __fls(num_pages))) {
@@ -555,7 +558,8 @@ EXPORT_SYMBOL(ttm_pool_free);
* Initialize the pool and its pool types.
*/
void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
- int nid, bool use_dma_alloc, bool use_dma32)
+ int nid, bool use_dma_alloc, bool use_dma32,
+ bool use_strict_numa_alloc)
{
unsigned int i, j;
@@ -565,6 +569,7 @@ void ttm_pool_init(struct ttm_pool *pool,
struct device *dev,
pool->nid = nid;
pool->use_dma_alloc = use_dma_alloc;
pool->use_dma32 = use_dma32;
+ pool->use_strict_numa_alloc = use_strict_numa_alloc;
if (use_dma_alloc || nid != NUMA_NO_NODE) {
for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 30a347e5aa11..6b7bdc952466 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -72,6 +72,7 @@ struct ttm_pool {
bool use_dma_alloc;
bool use_dma32;
+ bool use_strict_numa_alloc;
struct {
struct ttm_pool_type orders[MAX_ORDER + 1];
@@ -83,7 +84,8 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct
ttm_tt *tt,
void ttm_pool_free(struct ttm_pool *pool, struct ttm_tt *tt);
void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
- int nid, bool use_dma_alloc, bool use_dma32);
+ int nid, bool use_dma_alloc, bool use_dma32,
+ bool use_strict_numa_alloc);
void ttm_pool_fini(struct ttm_pool *pool);
int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m);