Re: [RFC 03/14] drm/sched: Implement RR via FIFO

Danilo Krummrich <dakr@xxxxxxxxxx> · Wed, 8 Jan 2025 10:42:04 +0100

On Mon, Dec 30, 2024 at 04:52:48PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx>
> 
> Round-robin being the non-default policy and unclear how much it is used,

I don't know either, but would be nice if we could instead remove it.

> we can notice that it can be implemented using the FIFO data structures if
> we only invent a fake submit timestamp which is monotonically increasing
> inside drm_sched_rq instances.
> 
> So instead of remembering which was the last entity the scheduler worker
> picked, we can bump the picked one to the bottom of the tree, achieving
> the same round-robin behaviour.
> 
> Advantage is that we can consolidate to a single code path and remove a
> bunch of code. Downside is round-robin mode now needs to lock on the job
> pop path but that should not be visible.

I guess you did you test both scheduler strategies with this change?

> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx>
> Cc: Christian König <christian.koenig@xxxxxxx>
> Cc: Danilo Krummrich <dakr@xxxxxxxxxx>
> Cc: Matthew Brost <matthew.brost@xxxxxxxxx>
> Cc: Philipp Stanner <pstanner@xxxxxxxxxx>
> ---
>  drivers/gpu/drm/scheduler/sched_entity.c | 53 ++++++++-----
>  drivers/gpu/drm/scheduler/sched_main.c   | 99 +++---------------------
>  include/drm/gpu_scheduler.h              |  5 +-
>  3 files changed, 48 insertions(+), 109 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> index 8e910586979e..cb5f596b48b7 100644
> --- a/drivers/gpu/drm/scheduler/sched_entity.c
> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> @@ -473,9 +473,20 @@ drm_sched_job_dependency(struct drm_sched_job *job,
>  	return NULL;
>  }
>  
> +static ktime_t
> +drm_sched_rq_get_rr_deadline(struct drm_sched_rq *rq)
> +{
> +	lockdep_assert_held(&rq->lock);
> +
> +	rq->rr_deadline = ktime_add_ns(rq->rr_deadline, 1);
> +
> +	return rq->rr_deadline;
> +}
> +
>  struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>  {
> -	struct drm_sched_job *sched_job;
> +	struct drm_sched_job *sched_job, *next_job;
> +	struct drm_sched_rq *rq;
>  
>  	sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>  	if (!sched_job)
> @@ -510,24 +521,28 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>  	 * Update the entity's location in the min heap according to
>  	 * the timestamp of the next job, if any.
>  	 */
> -	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) {
> -		struct drm_sched_job *next;
> -		struct drm_sched_rq *rq;
> +	next_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>  
> -		spin_lock(&entity->lock);
> -		rq = entity->rq;
> -		spin_lock(&rq->lock);
> -		next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
> -		if (next) {
> -			drm_sched_rq_update_fifo_locked(entity, rq,
> -							next->submit_ts);
> -		} else {
> -			drm_sched_rq_remove_fifo_locked(entity, rq);
> -		}
> -		spin_unlock(&rq->lock);
> -		spin_unlock(&entity->lock);
> +	spin_lock(&entity->lock);
> +	rq = entity->rq;
> +	spin_lock(&rq->lock);
> +
> +	if (next_job) {
> +		ktime_t ts;
> +
> +		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> +			ts = next_job->submit_ts;
> +		else
> +			ts = drm_sched_rq_get_rr_deadline(rq);
> +
> +		drm_sched_rq_update_fifo_locked(entity, rq, ts);
> +	} else {
> +		drm_sched_rq_remove_fifo_locked(entity, rq);
>  	}
>  
> +	spin_unlock(&rq->lock);
> +	spin_unlock(&entity->lock);
> +
>  	/* Jobs and entities might have different lifecycles. Since we're
>  	 * removing the job from the entities queue, set the jobs entity pointer
>  	 * to NULL to prevent any future access of the entity through this job.
> @@ -624,9 +639,9 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>  
>  		spin_lock(&rq->lock);
>  		drm_sched_rq_add_entity(rq, entity);
> -
> -		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> -			drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
> +		if (drm_sched_policy == DRM_SCHED_POLICY_RR)
> +			submit_ts = drm_sched_rq_get_rr_deadline(rq);
> +		drm_sched_rq_update_fifo_locked(entity, rq, submit_ts);
>  
>  		spin_unlock(&rq->lock);
>  		spin_unlock(&entity->lock);
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 9beb4c611988..eb22b1b7de36 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -189,7 +189,6 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
>  	spin_lock_init(&rq->lock);
>  	INIT_LIST_HEAD(&rq->entities);
>  	rq->rb_tree_root = RB_ROOT_CACHED;
> -	rq->current_entity = NULL;
>  	rq->sched = sched;
>  }
>  
> @@ -235,82 +234,13 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>  	atomic_dec(rq->sched->score);
>  	list_del_init(&entity->list);
>  
> -	if (rq->current_entity == entity)
> -		rq->current_entity = NULL;
> -
> -	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
> -		drm_sched_rq_remove_fifo_locked(entity, rq);
> +	drm_sched_rq_remove_fifo_locked(entity, rq);
>  
>  	spin_unlock(&rq->lock);
>  }
>  
>  /**
> - * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
> - *
> - * @sched: the gpu scheduler
> - * @rq: scheduler run queue to check.
> - *
> - * Try to find the next ready entity.
> - *
> - * Return an entity if one is found; return an error-pointer (!NULL) if an
> - * entity was ready, but the scheduler had insufficient credits to accommodate
> - * its job; return NULL, if no ready entity was found.
> - */
> -static struct drm_sched_entity *
> -drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
> -			      struct drm_sched_rq *rq)
> -{
> -	struct drm_sched_entity *entity;
> -
> -	spin_lock(&rq->lock);
> -
> -	entity = rq->current_entity;
> -	if (entity) {
> -		list_for_each_entry_continue(entity, &rq->entities, list) {
> -			if (drm_sched_entity_is_ready(entity)) {
> -				/* If we can't queue yet, preserve the current
> -				 * entity in terms of fairness.
> -				 */
> -				if (!drm_sched_can_queue(sched, entity)) {
> -					spin_unlock(&rq->lock);
> -					return ERR_PTR(-ENOSPC);
> -				}
> -
> -				rq->current_entity = entity;
> -				reinit_completion(&entity->entity_idle);
> -				spin_unlock(&rq->lock);
> -				return entity;
> -			}
> -		}
> -	}
> -
> -	list_for_each_entry(entity, &rq->entities, list) {
> -		if (drm_sched_entity_is_ready(entity)) {
> -			/* If we can't queue yet, preserve the current entity in
> -			 * terms of fairness.
> -			 */
> -			if (!drm_sched_can_queue(sched, entity)) {
> -				spin_unlock(&rq->lock);
> -				return ERR_PTR(-ENOSPC);
> -			}
> -
> -			rq->current_entity = entity;
> -			reinit_completion(&entity->entity_idle);
> -			spin_unlock(&rq->lock);
> -			return entity;
> -		}
> -
> -		if (entity == rq->current_entity)
> -			break;
> -	}
> -
> -	spin_unlock(&rq->lock);
> -
> -	return NULL;
> -}
> -
> -/**
> - * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
> + * drm_sched_rq_select_entity - Select an entity which provides a job to run
>   *
>   * @sched: the gpu scheduler
>   * @rq: scheduler run queue to check.
> @@ -322,32 +252,29 @@ drm_sched_rq_select_entity_rr(struct drm_gpu_scheduler *sched,
>   * its job; return NULL, if no ready entity was found.
>   */
>  static struct drm_sched_entity *
> -drm_sched_rq_select_entity_fifo(struct drm_gpu_scheduler *sched,
> -				struct drm_sched_rq *rq)
> +drm_sched_rq_select_entity(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_rq *rq)
>  {
> +	struct drm_sched_entity *entity = NULL;
>  	struct rb_node *rb;
>  
>  	spin_lock(&rq->lock);
>  	for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) {
> -		struct drm_sched_entity *entity;
> -
>  		entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node);
>  		if (drm_sched_entity_is_ready(entity)) {
> -			/* If we can't queue yet, preserve the current entity in
> -			 * terms of fairness.
> -			 */
>  			if (!drm_sched_can_queue(sched, entity)) {
> -				spin_unlock(&rq->lock);
> -				return ERR_PTR(-ENOSPC);
> +				entity = ERR_PTR(-ENOSPC);
> +				break;
>  			}
>  
>  			reinit_completion(&entity->entity_idle);
>  			break;
>  		}
> +		entity = NULL;
>  	}
>  	spin_unlock(&rq->lock);
>  
> -	return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL;
> +	return entity;
>  }

The whole diff of drm_sched_rq_select_entity_fifo() (except for the name) has
nothing to do with the patch, does it?

If you want refactor things, please do it in a separate patch.

>  
>  /**
> @@ -1045,20 +972,18 @@ void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
>  static struct drm_sched_entity *
>  drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>  {
> -	struct drm_sched_entity *entity;
> +	struct drm_sched_entity *entity = NULL;
>  	int i;
>  
>  	/* Start with the highest priority.
>  	 */
>  	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
> -		entity = drm_sched_policy == DRM_SCHED_POLICY_FIFO ?
> -			drm_sched_rq_select_entity_fifo(sched, sched->sched_rq[i]) :
> -			drm_sched_rq_select_entity_rr(sched, sched->sched_rq[i]);
> +		entity = drm_sched_rq_select_entity(sched, sched->sched_rq[i]);
>  		if (entity)
>  			break;
>  	}
>  
> -	return IS_ERR(entity) ? NULL : entity;
> +	return IS_ERR(entity) ? NULL : entity;;
>  }
>  
>  /**
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 978ca621cc13..db65600732b9 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -245,8 +245,7 @@ struct drm_sched_entity {
>   * struct drm_sched_rq - queue of entities to be scheduled.
>   *
>   * @sched: the scheduler to which this rq belongs to.
> - * @lock: protects @entities, @rb_tree_root and @current_entity.
> - * @current_entity: the entity which is to be scheduled.
> + * @lock: protects @entities, @rb_tree_root and @rr_deadline.
>   * @entities: list of the entities to be scheduled.
>   * @rb_tree_root: root of time based priority queue of entities for FIFO scheduling
>   *
> @@ -259,7 +258,7 @@ struct drm_sched_rq {
>  
>  	spinlock_t			lock;
>  	/* Following members are protected by the @lock: */
> -	struct drm_sched_entity		*current_entity;
> +	ktime_t				rr_deadline;
>  	struct list_head		entities;
>  	struct rb_root_cached		rb_tree_root;
>  };
> -- 
> 2.47.1
>