Re: [PATCH v3] drm/sched: Add FIFO sched policy to run queue v3

Luben Tuikov <luben.tuikov@xxxxxxx> · Wed, 21 Sep 2022 05:08:42 -0400

Inlined:

On 2022-09-20 15:16, Andrey Grodzovsky wrote:
> 
> On 2022-09-19 23:11, Luben Tuikov wrote:
>> Please run this patch through checkpatch.pl, as it shows
>> 12 warnings with it. Use these command line options:
>> "--strict --show-types".
>>
>> Inlined:
>>
>> On 2022-09-13 16:40, Andrey Grodzovsky wrote:
>>> Given many entities competing for same run queue on
>>> the same scheduler and unacceptably long wait time for some
>>> jobs waiting stuck in the run queue before being picked up are
>>> observed (seen using  GPUVis).
>> Since the second part of this sentence is the result of the first,
>> I'd say something like "When many entities ... we see unacceptably long ...".
>>
>>> The issue is due to the Round Robin policy used by schedulers
>>> to pick up the next entity's job queue for execution. Under stress
>>> of many entities and long job queus within entity some
>> Spelling: "queues".
>>
>>> jobs could be stack for very long time in it's entity's
>> "stuck", not "stack".
>>
>>> queue before being popped from the queue and executed
>>> while for other entities with smaller job queues a job
>>> might execute earlier even though that job arrived later
>>> then the job in the long queue.
>> "than".
>>
>>>     
>>> Fix:
>>> Add FIFO selection policy to entities in run queue, chose next entity
>>> on run queue in such order that if job on one entity arrived
>>> earlier then job on another entity the first job will start
>>> executing earlier regardless of the length of the entity's job
>>> queue.
>>>     
>>> v2:
>>> Switch to rb tree structure for entities based on TS of
>>> oldest job waiting in the job queue of an entity. Improves next
>>> entity extraction to O(1). Entity TS update
>>> O(log N) where N is the number of entities in the run-queue
>>>     
>>> Drop default option in module control parameter.
>>>
>>> v3:
>>> Various cosmetical fixes and minor refactoring of fifo update function.
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>>> Tested-by: Li Yunxiang (Teddy) <Yunxiang.Li@xxxxxxx>
>>> ---
>>>   drivers/gpu/drm/scheduler/sched_entity.c |  26 ++++-
>>>   drivers/gpu/drm/scheduler/sched_main.c   | 132 ++++++++++++++++++++++-
>>>   include/drm/gpu_scheduler.h              |  35 ++++++
>>>   3 files changed, 187 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
>>> index 6b25b2f4f5a3..f3ffce3c9304 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_entity.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c
>>> @@ -73,6 +73,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity,
>>>   	entity->priority = priority;
>>>   	entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
>>>   	entity->last_scheduled = NULL;
>>> +	RB_CLEAR_NODE(&entity->rb_tree_node);
>>>   
>>>   	if(num_sched_list)
>>>   		entity->rq = &sched_list[0]->sched_rq[entity->priority];
>>> @@ -417,14 +418,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>>   
>>>   	sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
>>>   	if (!sched_job)
>>> -		return NULL;
>>> +		goto skip;
>>>   
>>>   	while ((entity->dependency =
>>>   			drm_sched_job_dependency(sched_job, entity))) {
>>>   		trace_drm_sched_job_wait_dep(sched_job, entity->dependency);
>>>   
>>> -		if (drm_sched_entity_add_dependency_cb(entity))
>>> -			return NULL;
>>> +		if (drm_sched_entity_add_dependency_cb(entity)) {
>>> +			sched_job = NULL;
>>> +			goto skip;
>>> +		}
>>>   	}
>>>   
>>>   	/* skip jobs from entity that marked guilty */
>>> @@ -443,6 +446,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
>>>   	smp_wmb();
>>>   
>>>   	spsc_queue_pop(&entity->job_queue);
>>> +
>>> +	/*
>>> +	 * It's when head job is extracted we can access the next job (or empty)
>>> +	 * queue and update the entity location in the min heap accordingly.
>>> +	 */
>>> +skip:
>>> +	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>>> +		drm_sched_rq_update_fifo(entity,
>>> +					 (sched_job ? sched_job->submit_ts : ktime_get()));
>>> +
>>>   	return sched_job;
>>>   }
>>>   
>>> @@ -502,11 +515,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>>   {
>>>   	struct drm_sched_entity *entity = sched_job->entity;
>>>   	bool first;
>>> +	ktime_t ts =  ktime_get();
>>>   
>>>   	trace_drm_sched_job(sched_job, entity);
>>>   	atomic_inc(entity->rq->sched->score);
>>>   	WRITE_ONCE(entity->last_user, current->group_leader);
>>>   	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>>> +	sched_job->submit_ts = ts;
>>>   
>>>   	/* first job wakes up scheduler */
>>>   	if (first) {
>>> @@ -518,8 +533,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
>>>   			DRM_ERROR("Trying to push to a killed entity\n");
>>>   			return;
>>>   		}
>>> +
>>>   		drm_sched_rq_add_entity(entity->rq, entity);
>>>   		spin_unlock(&entity->rq_lock);
>>> +
>>> +		if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>>> +			drm_sched_rq_update_fifo(entity, ts);
>>> +
>>>   		drm_sched_wakeup(entity->rq->sched);
>>>   	}
>>>   }
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>>> index e5a4ecde0063..72f7105e0b16 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -62,6 +62,65 @@
>>>   #define to_drm_sched_job(sched_job)		\
>>>   		container_of((sched_job), struct drm_sched_job, queue_node)
>>>   
>>> +int drm_sched_policy = DRM_SCHED_POLICY_RR;
>>> +
>>> +/**
>>> + * DOC: sched_policy (int)
>>> + * Used to override default entities scheduling policy in a run queue.
>>> + */
>>> +MODULE_PARM_DESC(sched_policy,
>>> +	"specify schedule policy for entities on a runqueue (DRM_SCHED_POLICY_RR = Round Robin (default) ,DRM_SCHED_POLICY_FIFO  = use FIFO");
>> " ... Robin (default) ,DRM_SCHED_POLICY_FIFO ..."
>>
>> Swap the position of the space and comma. Check with modinfo that those are correctly
>> substituted and look nice in the output.
>>
>>> +module_param_named(sched_policy, drm_sched_policy, int, 0444);
>>> +
>>> +static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a,
>>> +							     const struct rb_node *b)
>>> +{
>>> +	struct drm_sched_entity *ent_a =  rb_entry((a), struct drm_sched_entity, rb_tree_node);
>>> +	struct drm_sched_entity *ent_b =  rb_entry((b), struct drm_sched_entity, rb_tree_node);
>>> +
>>> +	return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting);
>>> +}
>>> +
>>> +static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity)
>>> +{
>>> +	struct drm_sched_rq *rq = entity->rq;
>>> +
>>> +	if (!RB_EMPTY_NODE(&entity->rb_tree_node)) {
>>> +		rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root);
>>> +		RB_CLEAR_NODE(&entity->rb_tree_node);
>>> +	}
>>> +}
>>> +
>>> +static inline void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity,
>>> +						   ktime_t ts)
>>> +{
>>> +	struct drm_sched_rq *rq = entity->rq;
>>> +
>>> +	drm_sched_rq_remove_fifo_locked(entity);
>>> +
>>> +	entity->oldest_job_waiting = ts;
>>> +
>>> +	rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root,
>>> +			drm_sched_entity_compare_before);
>>> +}
>>> +
>>> +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts)
>>> +{
>>> +
>>> +	/*
>>> +	 * Both locks need to be grabbed, one to protect from entity->rq change
>>> +	 * for entity from within concurrent drm_sched_entity_select_rq and the
>>> +	 * other to update the rb tree structure.
>>> +	 */
>>> +	spin_lock(&entity->rq_lock);
>>> +	spin_lock(&entity->rq->lock);
>>> +
>>> +	drm_sched_rq_update_fifo_locked(entity, ts);
>>> +
>>> +	spin_unlock(&entity->rq->lock);
>>> +	spin_unlock(&entity->rq_lock);
>>> +}
>> Here you grab both locks, but you call drm_sched_rq_update_fifo_locked()
>> from drm_sched_rq_select_entity_fifo() with only the rq->lock.
> 
> 
> It's somewhat problematic I agree but, as you can see from 
> drm_sched_rq_update_fifo
> comment we need both locks to protect from racing 
> drm_sched_entity_select_rq against us
> which can change entity->rq while we update the time stamp. For the 
> particular case of
> drm_sched_rq_select_entity_fifo it's ok IMHO to rely on rq->lock only 
> because we only access those entities
> which are still in the rq rb_tree meaning if drm_sched_entity_select_rq 
> runs concurrently now, we grabbed the
> rq->lock before drm_sched_entity_select_rq->drm_sched_rq_remove_entity 
> executed which means
> entity->rq = rq in drm_sched_entity_select_rq didn't happen yet so we 
> should be ok

Well either make locking consistent, or explain why locking is inconsistent
by adding the explanation you wrote above into the comment of the function.
People need to know why locking is inconsistent--add that explanation above
as a comment.

If someone wants to add to the scheduler, say a new function or new functionality,
they'd not know whether to grab both locks or just the rq->lock. The comments
left therein should explain which way to proceed with locking in the future.

> 
> 
>>
>>> +
>>>   /**
>>>    * drm_sched_rq_init - initialize a given run queue struct
>>>    *
>>> @@ -75,6 +134,7 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
>>>   {
>>>   	spin_lock_init(&rq->lock);
>>>   	INIT_LIST_HEAD(&rq->entities);
>>> +	rq->rb_tree_root = RB_ROOT_CACHED;
>>>   	rq->current_entity = NULL;
>>>   	rq->sched = sched;
>>>   }
>>> @@ -92,9 +152,12 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>>>   {
>>>   	if (!list_empty(&entity->list))
>>>   		return;
>>> +
>>>   	spin_lock(&rq->lock);
>>> +
>>>   	atomic_inc(rq->sched->score);
>>>   	list_add_tail(&entity->list, &rq->entities);
>>> +
>>>   	spin_unlock(&rq->lock);
>>>   }
>>>   
>>> @@ -111,23 +174,32 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>   {
>>>   	if (list_empty(&entity->list))
>>>   		return;
>>> +
>>>   	spin_lock(&rq->lock);
>>> +
>>> +
>>>   	atomic_dec(rq->sched->score);
>>>   	list_del_init(&entity->list);
>>> +
>>>   	if (rq->current_entity == entity)
>>>   		rq->current_entity = NULL;
>>> +
>>> +	if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
>>> +		drm_sched_rq_remove_fifo_locked(entity);
>>> +
>>>   	spin_unlock(&rq->lock);
>>>   }
>>>   
>>> +
>>>   /**
>>> - * drm_sched_rq_select_entity - Select an entity which could provide a job to run
>>> + * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run
>>>    *
>>>    * @rq: scheduler run queue to check.
>>>    *
>>>    * Try to find a ready entity, returns NULL if none found.
>>>    */
>>>   static struct drm_sched_entity *
>>> -drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq)
>>>   {
>>>   	struct drm_sched_entity *entity;
>>>   
>>> @@ -163,6 +235,58 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq)
>>>   	return NULL;
>>>   }
>>>   
>>> +
>>> +/**
>>> + * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run
>>> + *
>>> + * @rq: scheduler run queue to check.
>>> + *
>>> + * Try to find a ready entity, returns NULL if none found.
>>> + */
>>> +static struct drm_sched_entity *
>>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>>> +{
>>> +		struct drm_sched_entity *first, *entity = NULL;
>>> +		struct rb_node *rb;
>>> +
>>> +		spin_lock(&rq->lock);
>>> +
>>> +		rb = rb_first_cached(&rq->rb_tree_root);
>>> +		if (!rb)
>>> +			goto out;
>>> +
>>> +		first = rb_entry((rb), struct drm_sched_entity, rb_tree_node);
>>> +		entity = first;
>>> +
>>> +		while (true) {
>>> +
>>> +			if (drm_sched_entity_is_ready(entity)) {
>>> +				rq->current_entity = entity;
>>> +				reinit_completion(&entity->entity_idle);
>>> +				break;
>>> +			}
>>> +
>>> +			/*
>>> +			 * Push not ready entity to the end of the line so others
>>> +			 * have chance
>>> +			 */
>>> +			drm_sched_rq_update_fifo_locked(entity, ktime_get());
>>> +
>>> +
>>> +			rb = rb_first_cached(&rq->rb_tree_root);
>> You've an extra blank line above--checkpatch.pl with the options mentioned above
>> will reveal them all.
>>
>> You call drm_sched_rq_update_fifo_locked() with, what it seems, only the rq->lock
>> held, but in drm_sched_rq_update_fifo() you stipulate that both locks need to be held.
>> Should resolve this.
>>
>> Note that if no entity is ready, this code modifies all the timestamps of all entities
>> in the tree, and reinserts all the entities back again, for a total of O(N * log N),
>> O(N) to pick each one and O(log N) to reinsert, again, if no entity is ready and you
>> go over all of them. (Using rb_first_cached(), modify timestamp, rb_add_cached(),
>> rb_first_cached(), modify timestamp, rb_add_cached(), ..., and so on for all elements
>> in the tree).
>>
>> I feel that you shouldn't have to modify the time stamp of any entity, and that
>> you shouldn't have to modify the tree. You want to preserve the order in which
>> entities were added. Instead you should do an ordered walk, i.e. oldest entity to
>> youngest entity and pick the oldest one ready, using something like the following:
>>
>> static struct drm_sched_entity *
>> drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
>> {
>> 	struct rb_node *rb;
>>
>> 	spin_lock(&rq->lock);
>> 	for (rb = rb_first_cached(&rq->rb_tree); rb; rb = rb_next(rb)) {
>> 		struct drm_sched_entity *entity;
>>
>> 		entity = rb_entry(rb, struct drm_sched_entity, rb_node);
>> 		if (drm_sched_entity_is_ready(entity)) {
>> 			rq->current_entity = entity;
>> 			reinit_completion(&entity->entity_idle);
>> 			break;
>> 		}
>> 	}
>> 	spin_unlock(&rq->lock);
>>
>> 	return rb ? rb_entry(rb, struct drm_sched_entity, rb_node) : NULL;
>> }
>>
>> The idea is that this preserves the insertion order, the timestamp, and
>> you search for the oldest entity, which is also ready, to schedule--not getting
>> the time, and no tree insert/modify operations, other than an in-order walk.
> 
> 
> This one much more elegant and efficient agree but - imagine now you 
> have 100 idle
> entities and 1 active  (100 idle game instances and one running) - in my 
> approach you would (with one minor tweak of updating idle entity time 
> stamp to TIME64_MAX instead of current time ktime_get)
> spend 1 time nlogn effort to put the running game in the head of the 
> priority queue but since then and on
> all the idle entities would stay in the back of the queue until a real 
> job comes to them. For your approach
> we would always keep iterating over all of them first before reaching 
> active entity because they have oldest TS which we never updated.

I see what you want to achieve with this tweak, but it is still undesirable
to modify EVERY entity, and reinsert every entity, for a O(N*lnN) time complexity.
(If you used TIME64_MIN, you can modify just the ready entity and reinsert only it,
but this is still no good, as explained below.)

You can search the priority queue as I've shown above, and you can still
reorder by last-ready, but I contend that you shouldn't have to modify timestamps,
or reorder/reinsert at all, and do any of this.

First, in-order walk is O(N) which is much faster than O(N*lnN) for N > 2,
which means that you'll find the first ready entity much quicker using what I've shown
above.

Second, suppose that on a first iteration you did find a younger entity which is ready
skipping some old ones which are not yet ready for whatever reason. Then you did mess with
the timestamp of this younger entity making it oldest, the order, and you reinserted it to
the front of the queue. (This can be achieved by modifying only that element's timestamp,
not all.) Now suppose that an older entity, which had been waiting a long time became
ready--you'd miss that because in the FIFO, the younger entity has had its timestamp modified
and it was reinserted before that older entity, and if that younger entity is ready consistently,
then the older entity would be starved. In other words, you shouldn't reorder by ready-first,
timestamp-second, order. This defeats the purpose of the FIFO effort to avoid starvation of
older entities which became ready in the interim.

The correct way to address this is to use in-order walk, as I've shown above, in O(N)
time, and pick the first entity which is ready--which would be the oldest ready entity.

This way you really give priority to oldest entities which are also ready.

Meaning that you really want to preserve the real-time order in which they were submitted,
not the order in which they had been/became ready.

All you want to find is the oldest entity which is also ready, in the fastest way possible.

Regards,
Luben

> 
> Andrey
> 
> 
>>
>>> +			entity =  rb_entry((rb), struct drm_sched_entity, rb_tree_node);
>> Extra space after = and before rb_entry().
>>
>>> +
>>> +			/* We completed full cycle */
>>> +			if (!drm_sched_entity_is_ready(entity) && entity == first) {
>>> +				entity = NULL;
>>> +				break;
>>> +			}
>>> +		}
>>> +out:
>>> +		spin_unlock(&rq->lock);
>>> +		return entity;
>>> +}
>>> +
>>>   /**
>>>    * drm_sched_job_done - complete a job
>>>    * @s_job: pointer to the job which is done
>>> @@ -803,7 +927,9 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
>>>   
>>>   	/* Kernel run queue has higher priority than normal run queue*/
>>>   	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
>>> -		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
>>> +		entity = drm_sched_policy != DRM_SCHED_POLICY_FIFO ?
>>> +				drm_sched_rq_select_entity_rr(&sched->sched_rq[i]) :
>>> +				drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]);
>>>   		if (entity)
>>>   			break;
>>>   	}
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 599855c6a672..e3fdfb757d61 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -50,6 +50,13 @@ enum drm_sched_priority {
>>>   	DRM_SCHED_PRIORITY_UNSET = -2
>>>   };
>>>   
>>> +/* Used to chose between FIFO and RR jobs scheduling */
>>> +extern int drm_sched_policy;
>>> +
>>> +
>>> +#define DRM_SCHED_POLICY_RR    0
>>> +#define DRM_SCHED_POLICY_FIFO  1
>>> +
>>>   /**
>>>    * struct drm_sched_entity - A wrapper around a job queue (typically
>>>    * attached to the DRM file_priv).
>>> @@ -196,6 +203,21 @@ struct drm_sched_entity {
>>>   	 * drm_sched_entity_fini().
>>>   	 */
>>>   	struct completion		entity_idle;
>>> +
>>> +	/**
>>> +	 * @oldest_job_waiting:
>>> +	 *
>>> +	 * Marks earliest job waiting in SW queue
>>> +	 */
>>> +	ktime_t				oldest_job_waiting;
>>> +
>>> +	/**
>>> +	 * @rb_tree_node:
>>> +	 *
>>> +	 * The node used to insert this entity into time based priority queue
>>> +	 */
>>> +	struct rb_node			rb_tree_node;
>> I'd probably call it "rb_node", shorter--it's up to you.
>>
>>> +
>>>   };
>>>   
>>>   /**
>>> @@ -205,6 +227,7 @@ struct drm_sched_entity {
>>>    * @sched: the scheduler to which this rq belongs to.
>>>    * @entities: list of the entities to be scheduled.
>>>    * @current_entity: the entity which is to be scheduled.
>>> + * @rb_tree_root: root of time based priory queue of entities for FIFO scheduling
>>>    *
>>>    * Run queue is a set of entities scheduling command submissions for
>>>    * one specific ring. It implements the scheduling policy that selects
>>> @@ -215,6 +238,7 @@ struct drm_sched_rq {
>>>   	struct drm_gpu_scheduler	*sched;
>>>   	struct list_head		entities;
>>>   	struct drm_sched_entity		*current_entity;
>>> +	struct rb_root_cached		rb_tree_root;
>> I'd probably just call it "rb_tree", shorter--it's up to you.
>>
>> Regards,
>> Luben
>>
>>>   };
>>>   
>>>   /**
>>> @@ -314,6 +338,14 @@ struct drm_sched_job {
>>>   
>>>   	/** @last_dependency: tracks @dependencies as they signal */
>>>   	unsigned long			last_dependency;
>>> +
>>> +
>>> +	/**
>>> +	 * @submit_ts:
>>> +	 *
>>> +	 * When the job was pushed into the entity queue.
>>> +	 */
>>> +	ktime_t                         submit_ts;
>>>   };
>>>   
>>>   static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
>>> @@ -503,6 +535,9 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
>>>   void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
>>>   				struct drm_sched_entity *entity);
>>>   
>>> +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts);
>>> +
>>> +
>>>   int drm_sched_entity_init(struct drm_sched_entity *entity,
>>>   			  enum drm_sched_priority priority,
>>>   			  struct drm_gpu_scheduler **sched_list,