On 20.10.2017 15:32, Andrey Grodzovsky wrote:
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited in size; during GPU reset it can fill up to its limit,
> and the pushing thread (producer) then waits for the scheduler worker to
> consume the items in the fifo while holding a reservation lock
> on a BO. The GPU reset thread, on the other hand, blocks the scheduler
> during reset. Before it unblocks the scheduler it might want
> to recover VRAM and so will try to reserve the same BO the producer
> thread is already holding, creating a deadlock.
>
> Fix:
> Switch from kfifo to an SPSC queue, which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky at amd.com>
> ---
>  drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
>  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>  3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>  			   __entry->id = sched_job->id;
>  			   __entry->fence = &sched_job->s_fence->finished;
>  			   __entry->name = sched_job->sched->name;
> -			   __entry->job_count = kfifo_len(
> -				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
> +			   __entry->job_count = spsc_queue_count(
> +				   &sched_job->s_entity->job_queue);
>  			   __entry->hw_job_count = atomic_read(
>  				   &sched_job->sched->hw_rq_count);
>  			   ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
>  #include <drm/drmP.h>
>  #include "gpu_scheduler.h"
>
> +#include "spsc_queue.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include "gpu_sched_trace.h"
>
> +#define to_amd_sched_job(sched_job)		\
> +		container_of((sched_job), struct amd_sched_job, queue_node)
> +
>  static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
>  static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
>  static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
> @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>  			  struct amd_sched_rq *rq,
>  			  uint32_t jobs)
>  {
> -	int r;
> -
>  	if (!(sched && entity && rq))
>  		return -EINVAL;
>
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>
>  	spin_lock_init(&entity->rq_lock);
>  	spin_lock_init(&entity->queue_lock);
> -	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> -	if (r)
> -		return r;
> +	spsc_queue_init(&entity->job_queue);
>
>  	atomic_set(&entity->fence_seq, 0);
>  	entity->fence_context = dma_fence_context_alloc(2);
> @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
>  static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>  {
>  	rmb();
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>  		return true;
>
>  	return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>   */
>  static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
>  {
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>  		return false;
>
>  	if (ACCESS_ONCE(entity->dependency))
> @@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>  	 */
>  	kthread_park(sched->thread);
>  	kthread_unpark(sched->thread);
> -	while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> +	while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>  		struct amd_sched_fence *s_fence = job->s_fence;
>  		amd_sched_fence_scheduled(s_fence);
>  		dma_fence_set_error(&s_fence->finished, -ESRCH);
> @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>  		dma_fence_put(&s_fence->finished);
>  		sched->ops->free_job(job);
>  	}
> -
>  	}
> -	kfifo_free(&entity->job_queue);
>  }
>
>  static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
> @@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
>  }
>
>  static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>  {
>  	struct amd_gpu_scheduler *sched = entity->sched;
> -	struct amd_sched_job *sched_job;
> +	struct amd_sched_job *sched_job = to_amd_sched_job(
> +					spsc_queue_peek(&entity->job_queue));
>
> -	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> +	if (!sched_job)
>  		return NULL;
>
>  	while ((entity->dependency = sched->ops->dependency(sched_job)))
>  		if (amd_sched_entity_add_dependency_cb(entity))
>  			return NULL;
>
> +	sched_job->s_entity = NULL;
> +	spsc_queue_pop(&entity->job_queue);
>  	return sched_job;
>  }
>
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>   *
>   * Returns true if we could submit the job.
>   */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>  {
>  	struct amd_gpu_scheduler *sched = sched_job->sched;
>  	struct amd_sched_entity *entity = sched_job->s_entity;
> -	bool added, first = false;
> +	bool first = false;
>
>  	spin_lock(&entity->queue_lock);
> -	added = kfifo_in(&entity->job_queue, &sched_job,
> -			sizeof(sched_job)) == sizeof(sched_job);
> -
> -	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> -		first = true;
> +	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>
>  	spin_unlock(&entity->queue_lock);
>
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>  		spin_unlock(&entity->rq_lock);
>  		amd_sched_wakeup(sched);
>  	}
> -	return added;
>  }
>
>  /* job_finish is called after hw fence signaled
> @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   */
>  void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
>  {
> -	struct amd_sched_entity *entity = sched_job->s_entity;
> -
>  	trace_amd_sched_job(sched_job);
> -	wait_event(entity->sched->job_scheduled,
> -		   amd_sched_entity_in(sched_job));
> +	amd_sched_entity_in(sched_job);

To save a few loc, amd_sched_entity_in() can now be merged completely
into this function; a rough sketch of what I mean is at the bottom of
this mail.

Apart from that the patch looks really good to me and is
Reviewed-by: Christian König <christian.koenig at amd.com>.

Ping me internally for new work if you now have time again.

Thanks,
Christian.
>  }
>
>  /* init a sched_job with basic field */
> @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
>  {
>  	struct sched_param sparam = {.sched_priority = 1};
>  	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> -	int r, count;
> +	int r;
>
>  	sched_setscheduler(current, SCHED_FIFO, &sparam);
>
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>  		if (!entity)
>  			continue;
>
> -		sched_job = amd_sched_entity_peek_job(entity);
> +		sched_job = amd_sched_entity_pop_job(entity);
>  		if (!sched_job)
>  			continue;
>
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>  			amd_sched_process_job(NULL, &s_fence->cb);
>  		}
>
> -		count = kfifo_out(&entity->job_queue, &sched_job,
> -				sizeof(sched_job));
> -		WARN_ON(count != sizeof(sched_job));
>  		wake_up(&sched->job_scheduled);
>  	}
>  	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>
>  #include <linux/kfifo.h>
>  #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>
>  struct amd_gpu_scheduler;
>  struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>  	struct amd_gpu_scheduler	*sched;
>
>  	spinlock_t			queue_lock;
> -	struct kfifo			job_queue;
> +	struct spsc_queue		job_queue;
>
>  	atomic_t			fence_seq;
>  	uint64_t			fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {
>  };
>
>  struct amd_sched_job {
> +	struct spsc_node		queue_node;
>  	struct amd_gpu_scheduler	*sched;
>  	struct amd_sched_entity		*s_entity;
>  	struct amd_sched_fence		*s_fence;
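To illustrate the merge suggested above: here is a rough, untested
sketch of amd_sched_entity_push_job() with amd_sched_entity_in() folded
in. It assumes the first-job branch keeps the existing run queue
handling from amd_sched_entity_in() (amd_sched_rq_add_entity() under
rq_lock, then amd_sched_wakeup()):

void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
{
	struct amd_gpu_scheduler *sched = sched_job->sched;
	struct amd_sched_entity *entity = sched_job->s_entity;
	bool first;

	trace_amd_sched_job(sched_job);

	spin_lock(&entity->queue_lock);
	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
	spin_unlock(&entity->queue_lock);

	/* first job wakes up scheduler */
	if (first) {
		/* Add the entity to the run queue */
		spin_lock(&entity->rq_lock);
		amd_sched_rq_add_entity(entity->rq, entity);
		spin_unlock(&entity->rq_lock);
		amd_sched_wakeup(sched);
	}
}

Note that sched and entity are read before the push: once the job is in
the queue the scheduler thread may pop and free it, so sched_job must
not be touched afterwards.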
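spsc_queue.h itself is not quoted in this mail, but the contract the
patch relies on is visible from the call sites: spsc_queue_peek()
returns NULL on an empty queue, spsc_queue_pop() unlinks and returns the
head node, and spsc_queue_push() reports whether the queue was empty,
which amd_sched_entity_in() uses as the "first job" signal. The
following is only a sketch of what such a header could look like; the
intrusive node/tail layout, the xchg/cmpxchg scheme and the largely
elided memory barriers are assumptions, not the actual file:

#include <linux/atomic.h>
#include <linux/compiler.h>	/* READ_ONCE()/WRITE_ONCE() */
#include <asm/processor.h>	/* cpu_relax() */

struct spsc_node {
	struct spsc_node *next;
};

struct spsc_queue {
	struct spsc_node *head;	/* consumer end */
	atomic_long_t tail;	/* producer end: &last->next, or &head when empty */
	atomic_t job_count;
};

static inline void spsc_queue_init(struct spsc_queue *queue)
{
	queue->head = NULL;
	atomic_long_set(&queue->tail, (long)&queue->head);
	atomic_set(&queue->job_count, 0);
}

static inline struct spsc_node *spsc_queue_peek(struct spsc_queue *queue)
{
	return READ_ONCE(queue->head);
}

static inline int spsc_queue_count(struct spsc_queue *queue)
{
	return atomic_read(&queue->job_count);
}

/* Returns true if the queue was empty, i.e. this is the first job. */
static inline bool spsc_queue_push(struct spsc_queue *queue,
				   struct spsc_node *node)
{
	struct spsc_node **tail;

	node->next = NULL;
	/* Swing tail to the new node first, then link the predecessor. */
	tail = (struct spsc_node **)atomic_long_xchg(&queue->tail,
						     (long)&node->next);
	WRITE_ONCE(*tail, node);
	atomic_inc(&queue->job_count);
	return tail == &queue->head;
}

static inline struct spsc_node *spsc_queue_pop(struct spsc_queue *queue)
{
	struct spsc_node *node, *next;

	node = READ_ONCE(queue->head);
	if (!node)
		return NULL;

	next = READ_ONCE(node->next);
	WRITE_ONCE(queue->head, next);

	if (!next) {
		/*
		 * node looked like the last element: try to swing tail
		 * back to &head. If that fails a push is in flight;
		 * wait until it has linked the new node.
		 */
		if (atomic_long_cmpxchg(&queue->tail, (long)&node->next,
					(long)&queue->head) != (long)&node->next) {
			while (!(next = READ_ONCE(node->next)))
				cpu_relax();
			WRITE_ONCE(queue->head, next);
		}
	}

	atomic_dec(&queue->job_count);
	return node;
}

One detail worth spelling out: the patch puts queue_node first in
struct amd_sched_job, so to_amd_sched_job(), a container_of() with
offset zero, maps a NULL spsc_node pointer back to a NULL job pointer.
That is what makes the pop-then-NULL-check loops in
amd_sched_entity_fini() and amd_sched_entity_pop_job() work.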