Re: [PATCH] drm/amdgpu: allocate entities on demand

Christian König <christian.koenig@xxxxxxx> · Fri, 24 Jan 2020 13:09:42 +0100

Am 24.01.20 um 12:53 schrieb Nirmoy Das:
Currently we pre-allocate entities and fences for all the HW IPs on
context creation and some of which are might never be used.

This patch tries to resolve entity/fences wastage by creating entity
only when needed.

Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 244 +++++++++++++-----------
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |   2 +-
  2 files changed, 135 insertions(+), 111 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 05c2af61e7de..73f7615df8c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -42,19 +42,12 @@ const unsigned int amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {
  	[AMDGPU_HW_IP_VCN_JPEG]	=	1,
  };
  
-static int amdgpu_ctx_total_num_entities(void)
-{
-	unsigned i, num_entities = 0;
-
-	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i)
-		num_entities += amdgpu_ctx_num_entities[i];
-
-	return num_entities;
-}
-
  static int amdgpu_ctx_priority_permit(struct drm_file *filp,
  				      enum drm_sched_priority priority)
  {
+	if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
+		return -EINVAL;
+
  	/* NORMAL and below are accessible by everyone */
  	if (priority <= DRM_SCHED_PRIORITY_NORMAL)
  		return 0;
@@ -68,64 +61,44 @@ static int amdgpu_ctx_priority_permit(struct drm_file *filp,
  	return -EACCES;
  }
  
-static int amdgpu_ctx_init(struct amdgpu_device *adev,
-			   enum drm_sched_priority priority,
-			   struct drm_file *filp,
-			   struct amdgpu_ctx *ctx)
+static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, const u32 hw_ip, const u32 ring)
  {
-	unsigned num_entities = amdgpu_ctx_total_num_entities();
-	unsigned i, j;
+	struct amdgpu_device *adev = ctx->adev;
+	struct amdgpu_ctx_entity *entity;
+	struct drm_gpu_scheduler **scheds;
+	struct drm_gpu_scheduler *sched;
+	unsigned num_scheds = 0;
+	enum drm_sched_priority priority;
  	int r;
  
-	if (priority < 0 || priority >= DRM_SCHED_PRIORITY_MAX)
-		return -EINVAL;
-
-	r = amdgpu_ctx_priority_permit(filp, priority);
-	if (r)
-		return r;
-
-	memset(ctx, 0, sizeof(*ctx));
-	ctx->adev = adev;
-
-
-	ctx->entities[0] = kcalloc(num_entities,
-				   sizeof(struct amdgpu_ctx_entity),
-				   GFP_KERNEL);
-	if (!ctx->entities[0])
-		return -ENOMEM;
-
-
-	for (i = 0; i < num_entities; ++i) {
-		struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
-
-		entity->sequence = 1;
-		entity->fences = kcalloc(amdgpu_sched_jobs,
-					 sizeof(struct dma_fence*), GFP_KERNEL);
-		if (!entity->fences) {
-			r = -ENOMEM;
-			goto error_cleanup_memory;
-		}
+	if (!ctx->entities[hw_ip]) {
+		ctx->entities[hw_ip] = kcalloc(amdgpu_ctx_num_entities[hw_ip],
+					       sizeof(struct amdgpu_ctx_entity *),
+					       GFP_KERNEL);
+		if (!ctx->entities[hw_ip])
+			return  -ENOMEM;

That's complete overkill, just statically allocate the array in the 
amdgpu_ctx structure.

The maximum instance should be 4 IIRC, so something like "struct 
amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][4];" so a maximum of 288 
bytes used.

Only alternative I see would be to allocate the array behind the 
structure, see dma_resv_list_alloc() for an example on how to do this. 
But I don't think that this is worth it.

Christian.

  	}
-	for (i = 1; i < AMDGPU_HW_IP_NUM; ++i)
-		ctx->entities[i] = ctx->entities[i - 1] +
-			amdgpu_ctx_num_entities[i - 1];
  
-	kref_init(&ctx->refcount);
-	spin_lock_init(&ctx->ring_lock);
-	mutex_init(&ctx->lock);
+	ctx->entities[hw_ip][ring] = kcalloc(1, sizeof(struct amdgpu_ctx_entity),
+					     GFP_KERNEL);
+	if (!ctx->entities[hw_ip][ring]) {
+		r = -ENOMEM;
+		goto error_free_entity;
+	}
  
-	ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
-	ctx->reset_counter_query = ctx->reset_counter;
-	ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
-	ctx->init_priority = priority;
-	ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
+	entity = ctx->entities[hw_ip][ring];
  
-	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
-		struct drm_gpu_scheduler **scheds;
-		struct drm_gpu_scheduler *sched;
-		unsigned num_scheds = 0;
+	entity->sequence = 1;
+	entity->fences = kcalloc(amdgpu_sched_jobs,
+				 sizeof(struct dma_fence*), GFP_KERNEL);
+	if (!entity->fences) {
+		r = -ENOMEM;
+		goto error_free_entity;
+	}
  
-		switch (i) {
+	priority = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
+				ctx->init_priority : ctx->override_priority;
+	switch (hw_ip) {
  		case AMDGPU_HW_IP_GFX:
  			sched = &adev->gfx.gfx_ring[0].sched;
  			scheds = &sched;
@@ -166,57 +139,85 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
  			scheds = adev->jpeg.jpeg_sched;
  			num_scheds =  adev->jpeg.num_jpeg_sched;
  			break;
-		}
-
-		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j)
-			r = drm_sched_entity_init(&ctx->entities[i][j].entity,
-						  priority, scheds,
-						  num_scheds, &ctx->guilty);
-		if (r)
-			goto error_cleanup_entities;
  	}
  
+	r = drm_sched_entity_init(&ctx->entities[hw_ip][ring]->entity,
+				  priority, scheds,
+				  num_scheds, &ctx->guilty);
+	if (r)
+		goto error_free_entity;
+
  	return 0;
  
-error_cleanup_entities:
-	for (i = 0; i < num_entities; ++i)
-		drm_sched_entity_destroy(&ctx->entities[0][i].entity);
+error_free_entity:
+	kfree(ctx->entities[hw_ip][ring]);
+	return r;
+}
  
-error_cleanup_memory:
-	for (i = 0; i < num_entities; ++i) {
-		struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
+static int amdgpu_ctx_init(struct amdgpu_device *adev,
+			   enum drm_sched_priority priority,
+			   struct drm_file *filp,
+			   struct amdgpu_ctx *ctx)
+{
+	int r;
  
-		kfree(entity->fences);
-		entity->fences = NULL;
-	}
+	r = amdgpu_ctx_priority_permit(filp, priority);
+	if (r)
+		return r;
+
+	memset(ctx, 0, sizeof(*ctx));
+
+	ctx->adev = adev;
+
+	kref_init(&ctx->refcount);
+	spin_lock_init(&ctx->ring_lock);
+	mutex_init(&ctx->lock);
+
+	ctx->reset_counter = atomic_read(&adev->gpu_reset_counter);
+	ctx->reset_counter_query = ctx->reset_counter;
+	ctx->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
+	ctx->init_priority = priority;
+	ctx->override_priority = DRM_SCHED_PRIORITY_UNSET;
+
+	return 0;
  
-	kfree(ctx->entities[0]);
-	ctx->entities[0] = NULL;
-	return r;
  }
  
  static void amdgpu_ctx_fini(struct kref *ref)
  {
  	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
-	unsigned num_entities = amdgpu_ctx_total_num_entities();
  	struct amdgpu_device *adev = ctx->adev;
-	unsigned i, j;
+	unsigned i, j, k;
  
  	if (!adev)
  		return;
  
-	for (i = 0; i < num_entities; ++i) {
-		struct amdgpu_ctx_entity *entity = &ctx->entities[0][i];
+	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+			struct amdgpu_ctx_entity *entity;
+
+			if (!ctx->entities[i] || !ctx->entities[i][j])
+				continue;
+
+			entity = ctx->entities[i][j];
+			if (!entity->fences)
+				continue;
  
-		for (j = 0; j < amdgpu_sched_jobs; ++j)
-			dma_fence_put(entity->fences[j]);
+			for (k = 0; k < amdgpu_sched_jobs; ++k)
+				dma_fence_put(entity->fences[k]);
  
-		kfree(entity->fences);
+			kfree(entity->fences);
+			entity->fences = NULL;
+
+			kfree(entity);
+			ctx->entities[i][j] = NULL;
+		}
+
+		kfree(ctx->entities[i];
+		ctx->entities[i] = NULL;
  	}
  
-	kfree(ctx->entities[0]);
  	mutex_destroy(&ctx->lock);
-
  	kfree(ctx);
  }
  
@@ -239,7 +240,11 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
  		return -EINVAL;
  	}
  
-	*entity = &ctx->entities[hw_ip][ring].entity;
+	if ((ctx->entities[hw_ip] == NULL) || (ctx->entities[hw_ip][ring] == NULL))
+		amdgpu_ctx_init_entity(ctx, hw_ip, ring);
+
+
+	*entity = &ctx->entities[hw_ip][ring]->entity;
  	return 0;
  }
  
@@ -279,14 +284,17 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
  static void amdgpu_ctx_do_release(struct kref *ref)
  {
  	struct amdgpu_ctx *ctx;
-	unsigned num_entities;
-	u32 i;
+	u32 i, j;
  
  	ctx = container_of(ref, struct amdgpu_ctx, refcount);
+	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+			if (!ctx->entities[i] || !ctx->entities[i][j])
+				continue;
  
-	num_entities = amdgpu_ctx_total_num_entities();
-	for (i = 0; i < num_entities; i++)
-		drm_sched_entity_destroy(&ctx->entities[0][i].entity);
+			drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
+		}
+	}
  
  	amdgpu_ctx_fini(ref);
  }
@@ -516,19 +524,23 @@ struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
  void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
  				  enum drm_sched_priority priority)
  {
-	unsigned num_entities = amdgpu_ctx_total_num_entities();
  	enum drm_sched_priority ctx_prio;
-	unsigned i;
+	unsigned i, j;
  
  	ctx->override_priority = priority;
  
  	ctx_prio = (ctx->override_priority == DRM_SCHED_PRIORITY_UNSET) ?
  			ctx->init_priority : ctx->override_priority;
+	for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+		for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+			struct drm_sched_entity *entity;
  
-	for (i = 0; i < num_entities; i++) {
-		struct drm_sched_entity *entity = &ctx->entities[0][i].entity;
+			if (!ctx->entities[i] || !ctx->entities[i][j])
+				continue;
  
-		drm_sched_entity_set_priority(entity, ctx_prio);
+			entity = &ctx->entities[i][j]->entity;
+			drm_sched_entity_set_priority(entity, ctx_prio);
+		}
  	}
  }
  
@@ -564,20 +576,24 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
  
  long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
  {
-	unsigned num_entities = amdgpu_ctx_total_num_entities();
  	struct amdgpu_ctx *ctx;
  	struct idr *idp;
-	uint32_t id, i;
+	uint32_t id, i, j;
  
  	idp = &mgr->ctx_handles;
  
  	mutex_lock(&mgr->lock);
  	idr_for_each_entry(idp, ctx, id) {
-		for (i = 0; i < num_entities; i++) {
-			struct drm_sched_entity *entity;
+		for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+			for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+				struct drm_sched_entity *entity;
+
+				if (!ctx->entities[i] || !ctx->entities[i][j])
+					continue;
  
-			entity = &ctx->entities[0][i].entity;
-			timeout = drm_sched_entity_flush(entity, timeout);
+				entity = &ctx->entities[i][j]->entity;
+				timeout = drm_sched_entity_flush(entity, timeout);
+			}
  		}
  	}
  	mutex_unlock(&mgr->lock);
@@ -586,10 +602,9 @@ long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout)
  
  void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
  {
-	unsigned num_entities = amdgpu_ctx_total_num_entities();
  	struct amdgpu_ctx *ctx;
  	struct idr *idp;
-	uint32_t id, i;
+	uint32_t id, i, j;
  
  	idp = &mgr->ctx_handles;
  
@@ -599,8 +614,17 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
  			continue;
  		}
  
-		for (i = 0; i < num_entities; i++)
-			drm_sched_entity_fini(&ctx->entities[0][i].entity);
+		for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+			for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+				struct drm_sched_entity *entity;
+
+				if (!ctx->entities[i] || !ctx->entities[i][j])
+					continue;
+
+				entity = &ctx->entities[i][j]->entity;
+				drm_sched_entity_fini(entity);
+			}
+		}
  	}
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index a6cd9d4b078c..ca9363e71df5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -42,7 +42,7 @@ struct amdgpu_ctx {
  	unsigned			reset_counter_query;
  	uint32_t			vram_lost_counter;
  	spinlock_t			ring_lock;
-	struct amdgpu_ctx_entity	*entities[AMDGPU_HW_IP_NUM];
+	struct amdgpu_ctx_entity	**entities[AMDGPU_HW_IP_NUM];
  	bool				preamble_presented;
  	enum drm_sched_priority		init_priority;
  	enum drm_sched_priority		override_priority;

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx