Inlined: On 2022-09-28 14:49, Andrey Grodzovsky wrote: > When many entities are competing for the same run queue > on the same scheduler, we observe an unusually long wait > times and some jobs get starved. This has been observed on GPUVis. > > The issue is due to the Round Robin policy used by schedulers > to pick up the next entity's job queue for execution. Under stress > of many entities and long job queues within entity some > jobs could be stuck for very long time in it's entity's > queue before being popped from the queue and executed > while for other entities with smaller job queues a job > might execute earlier even though that job arrived later > then the job in the long queue. > > Fix: > Add FIFO selection policy to entities in run queue, chose next entity > on run queue in such order that if job on one entity arrived > earlier then job on another entity the first job will start > executing earlier regardless of the length of the entity's job > queue. > > v2: > Switch to rb tree structure for entities based on TS of > oldest job waiting in the job queue of an entity. Improves next > entity extraction to O(1). Entity TS update > O(log N) where N is the number of entities in the run-queue > > Drop default option in module control parameter. > > v3: > Various cosmetical fixes and minor refactoring of fifo update function. (Luben) "Cosmetic" > > v4: > Switch drm_sched_rq_select_entity_fifo to in order search (Luben) > > v5: Fix up drm_sched_rq_select_entity_fifo loop v5 is actually a major major update of the whole patch and working of the patch. It fixes the entity selection from ready-first, timestamp-second, to oldest-first and ready-second, i.e. oldest-ready entity, as I suggested. The v5 blurb should read: v5: Fix drm_sched_rq_select_entity_fifo loop from reordering and timestamping on every selection, and picking the last ready first, timestamp-second, to picking the oldest-ready entity. No reinsertion. (Luben) > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> > Tested-by: Li Yunxiang (Teddy) <Yunxiang.Li@xxxxxxx> > --- > drivers/gpu/drm/scheduler/sched_entity.c | 26 ++++++- > drivers/gpu/drm/scheduler/sched_main.c | 99 +++++++++++++++++++++++- > include/drm/gpu_scheduler.h | 32 ++++++++ > 3 files changed, 151 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c > index 6b25b2f4f5a3..f3ffce3c9304 100644 > --- a/drivers/gpu/drm/scheduler/sched_entity.c > +++ b/drivers/gpu/drm/scheduler/sched_entity.c > @@ -73,6 +73,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, > entity->priority = priority; > entity->sched_list = num_sched_list > 1 ? sched_list : NULL; > entity->last_scheduled = NULL; > + RB_CLEAR_NODE(&entity->rb_tree_node); > > if(num_sched_list) > entity->rq = &sched_list[0]->sched_rq[entity->priority]; > @@ -417,14 +418,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > > sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue)); > if (!sched_job) > - return NULL; > + goto skip; > > while ((entity->dependency = > drm_sched_job_dependency(sched_job, entity))) { > trace_drm_sched_job_wait_dep(sched_job, entity->dependency); > > - if (drm_sched_entity_add_dependency_cb(entity)) > - return NULL; > + if (drm_sched_entity_add_dependency_cb(entity)) { > + sched_job = NULL; > + goto skip; > + } > } > > /* skip jobs from entity that marked guilty */ > @@ -443,6 +446,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) > smp_wmb(); > > spsc_queue_pop(&entity->job_queue); > + > + /* > + * It's when head job is extracted we can access the next job (or empty) > + * queue and update the entity location in the min heap accordingly. > + */ > +skip: > + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) > + drm_sched_rq_update_fifo(entity, > + (sched_job ? sched_job->submit_ts : ktime_get())); > + > return sched_job; > } > > @@ -502,11 +515,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) > { > struct drm_sched_entity *entity = sched_job->entity; > bool first; > + ktime_t ts = ktime_get(); > > trace_drm_sched_job(sched_job, entity); > atomic_inc(entity->rq->sched->score); > WRITE_ONCE(entity->last_user, current->group_leader); > first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node); > + sched_job->submit_ts = ts; > > /* first job wakes up scheduler */ > if (first) { > @@ -518,8 +533,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) > DRM_ERROR("Trying to push to a killed entity\n"); > return; > } > + > drm_sched_rq_add_entity(entity->rq, entity); > spin_unlock(&entity->rq_lock); > + > + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) > + drm_sched_rq_update_fifo(entity, ts); > + > drm_sched_wakeup(entity->rq->sched); > } > } > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index 4f2395d1a791..5349fc049384 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -62,6 +62,58 @@ > #define to_drm_sched_job(sched_job) \ > container_of((sched_job), struct drm_sched_job, queue_node) > > +int drm_sched_policy = DRM_SCHED_POLICY_RR; > + > +/** > + * DOC: sched_policy (int) > + * Used to override default entities scheduling policy in a run queue. > + */ > +MODULE_PARM_DESC(sched_policy, > + "specify schedule policy for entities on a runqueue (DRM_SCHED_POLICY_RR = Round Robin (default), DRM_SCHED_POLICY_FIFO = use FIFO"); > +module_param_named(sched_policy, drm_sched_policy, int, 0444); > + > +static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a, > + const struct rb_node *b) > +{ > + struct drm_sched_entity *ent_a = rb_entry((a), struct drm_sched_entity, rb_tree_node); > + struct drm_sched_entity *ent_b = rb_entry((b), struct drm_sched_entity, rb_tree_node); > + > + return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting); > +} > + > +static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity) > +{ > + struct drm_sched_rq *rq = entity->rq; > + > + if (!RB_EMPTY_NODE(&entity->rb_tree_node)) { > + rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root); > + RB_CLEAR_NODE(&entity->rb_tree_node); > + } > +} > + > +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts) > +{ > + struct drm_sched_rq *rq; > + > + /* > + * Both locks need to be grabbed, one to protect from entity->rq change > + * for entity from within concurrent drm_sched_entity_select_rq and the > + * other to update the rb tree structure. > + */ > + spin_lock(&entity->rq_lock); > + spin_lock(&entity->rq->lock); > + > + rq = entity->rq; > + > + entity->oldest_job_waiting = ts; > + > + rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root, > + drm_sched_entity_compare_before); > + > + spin_unlock(&entity->rq->lock); > + spin_unlock(&entity->rq_lock); > +} > + > /** > * drm_sched_rq_init - initialize a given run queue struct > * > @@ -75,6 +127,7 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched, > { > spin_lock_init(&rq->lock); > INIT_LIST_HEAD(&rq->entities); > + rq->rb_tree_root = RB_ROOT_CACHED; > rq->current_entity = NULL; > rq->sched = sched; > } > @@ -92,9 +145,12 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq, > { > if (!list_empty(&entity->list)) > return; > + > spin_lock(&rq->lock); > + > atomic_inc(rq->sched->score); > list_add_tail(&entity->list, &rq->entities); > + > spin_unlock(&rq->lock); > } > > @@ -111,23 +167,30 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, > { > if (list_empty(&entity->list)) > return; > + > spin_lock(&rq->lock); > + > atomic_dec(rq->sched->score); > list_del_init(&entity->list); > + > if (rq->current_entity == entity) > rq->current_entity = NULL; > + > + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) > + drm_sched_rq_remove_fifo_locked(entity); > + > spin_unlock(&rq->lock); > } > > /** > - * drm_sched_rq_select_entity - Select an entity which could provide a job to run > + * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run > * > * @rq: scheduler run queue to check. > * > * Try to find a ready entity, returns NULL if none found. > */ > static struct drm_sched_entity * > -drm_sched_rq_select_entity(struct drm_sched_rq *rq) > +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq) > { > struct drm_sched_entity *entity; > > @@ -163,6 +226,34 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) > return NULL; > } > > +/** > + * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run > + * > + * @rq: scheduler run queue to check. > + * > + * Find oldest waiting ready entity, returns NULL if none found. > + */ > +static struct drm_sched_entity * > +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) > +{ > + struct rb_node *rb; > + > + spin_lock(&rq->lock); > + for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) { > + struct drm_sched_entity *entity; > + > + entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node); > + if (drm_sched_entity_is_ready(entity)) { > + rq->current_entity = entity; > + reinit_completion(&entity->entity_idle); > + break; > + } > + } > + spin_unlock(&rq->lock); > + > + return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL; > +} Yay! With the changes to v5 blurb and testing this patch is: Reviewed-by: Luben Tuikov <luben.tuikov@xxxxxxx> Regards, Luben > + > /** > * drm_sched_job_done - complete a job > * @s_job: pointer to the job which is done > @@ -803,7 +894,9 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) > > /* Kernel run queue has higher priority than normal run queue*/ > for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { > - entity = drm_sched_rq_select_entity(&sched->sched_rq[i]); > + entity = drm_sched_policy != DRM_SCHED_POLICY_FIFO ? > + drm_sched_rq_select_entity_rr(&sched->sched_rq[i]) : > + drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]); > if (entity) > break; > } > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index 599855c6a672..1f7d9dd1a444 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -50,6 +50,12 @@ enum drm_sched_priority { > DRM_SCHED_PRIORITY_UNSET = -2 > }; > > +/* Used to chose between FIFO and RR jobs scheduling */ > +extern int drm_sched_policy; > + > +#define DRM_SCHED_POLICY_RR 0 > +#define DRM_SCHED_POLICY_FIFO 1 > + > /** > * struct drm_sched_entity - A wrapper around a job queue (typically > * attached to the DRM file_priv). > @@ -196,6 +202,21 @@ struct drm_sched_entity { > * drm_sched_entity_fini(). > */ > struct completion entity_idle; > + > + /** > + * @oldest_job_waiting: > + * > + * Marks earliest job waiting in SW queue > + */ > + ktime_t oldest_job_waiting; > + > + /** > + * @rb_tree_node: > + * > + * The node used to insert this entity into time based priority queue > + */ > + struct rb_node rb_tree_node; > + > }; > > /** > @@ -205,6 +226,7 @@ struct drm_sched_entity { > * @sched: the scheduler to which this rq belongs to. > * @entities: list of the entities to be scheduled. > * @current_entity: the entity which is to be scheduled. > + * @rb_tree_root: root of time based priory queue of entities for FIFO scheduling > * > * Run queue is a set of entities scheduling command submissions for > * one specific ring. It implements the scheduling policy that selects > @@ -215,6 +237,7 @@ struct drm_sched_rq { > struct drm_gpu_scheduler *sched; > struct list_head entities; > struct drm_sched_entity *current_entity; > + struct rb_root_cached rb_tree_root; > }; > > /** > @@ -314,6 +337,13 @@ struct drm_sched_job { > > /** @last_dependency: tracks @dependencies as they signal */ > unsigned long last_dependency; > + > + /** > + * @submit_ts: > + * > + * When the job was pushed into the entity queue. > + */ > + ktime_t submit_ts; > }; > > static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, > @@ -503,6 +533,8 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq, > void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, > struct drm_sched_entity *entity); > > +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts); > + > int drm_sched_entity_init(struct drm_sched_entity *entity, > enum drm_sched_priority priority, > struct drm_gpu_scheduler **sched_list,