Inlined: On 2022-09-22 12:15, Andrey Grodzovsky wrote: > > On 2022-09-22 11:03, Luben Tuikov wrote: >> The title of this patch has "v3", but "v4" in the title prefix. >> If you're using "-v" to git-format-patch, please remove the "v3" from the title. >> >> Inlined: >> >> On 2022-09-21 14:28, Andrey Grodzovsky wrote: >>> When many entities competing for same run queue on >>> the same scheduler When many entities have unacceptably long wait >>> time for some jobs waiting stuck in the run queue before being picked >>> up are observed (seen using GPUVis). >> Use this as your opening: >> >> "When many entities are competing for the same run queue on the same scheduler, >> we observe an unusually long wait times and some jobs get starved. This has >> been observed on GPUVis." >> >>> The issue is due to the Round Robin policy used by schedulers >>> to pick up the next entity's job queue for execution. Under stress >>> of many entities and long job queues within entity some >>> jobs could be stack for very long time in it's entity's >> "stuck", not "stack". >> >>> queue before being popped from the queue and executed >>> while for other entities with smaller job queues a job >>> might execute earlier even though that job arrived later >>> then the job in the long queue. >>> >>> Fix: >>> Add FIFO selection policy to entities in run queue, chose next entity >>> on run queue in such order that if job on one entity arrived >>> earlier then job on another entity the first job will start >>> executing earlier regardless of the length of the entity's job >>> queue. >>> >>> v2: >>> Switch to rb tree structure for entities based on TS of >>> oldest job waiting in the job queue of an entity. Improves next >>> entity extraction to O(1). Entity TS update >>> O(log N) where N is the number of entities in the run-queue >>> >>> Drop default option in module control parameter. >>> >>> v3: >>> Various cosmetical fixes and minor refactoring of fifo update function. (Luben) >>> >>> v4: >>> Switch drm_sched_rq_select_entity_fifo to in order search (Luben) >>> >>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >>> Tested-by: Li Yunxiang (Teddy) <Yunxiang.Li@xxxxxxx> >>> --- >>> drivers/gpu/drm/scheduler/sched_entity.c | 26 +++++- >>> drivers/gpu/drm/scheduler/sched_main.c | 107 ++++++++++++++++++++++- >>> include/drm/gpu_scheduler.h | 32 +++++++ >>> 3 files changed, 159 insertions(+), 6 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c >>> index 6b25b2f4f5a3..f3ffce3c9304 100644 >>> --- a/drivers/gpu/drm/scheduler/sched_entity.c >>> +++ b/drivers/gpu/drm/scheduler/sched_entity.c >>> @@ -73,6 +73,7 @@ int drm_sched_entity_init(struct drm_sched_entity *entity, >>> entity->priority = priority; >>> entity->sched_list = num_sched_list > 1 ? sched_list : NULL; >>> entity->last_scheduled = NULL; >>> + RB_CLEAR_NODE(&entity->rb_tree_node); >>> >>> if(num_sched_list) >>> entity->rq = &sched_list[0]->sched_rq[entity->priority]; >>> @@ -417,14 +418,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) >>> >>> sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue)); >>> if (!sched_job) >>> - return NULL; >>> + goto skip; >>> >>> while ((entity->dependency = >>> drm_sched_job_dependency(sched_job, entity))) { >>> trace_drm_sched_job_wait_dep(sched_job, entity->dependency); >>> >>> - if (drm_sched_entity_add_dependency_cb(entity)) >>> - return NULL; >>> + if (drm_sched_entity_add_dependency_cb(entity)) { >>> + sched_job = NULL; >>> + goto skip; >>> + } >>> } >>> >>> /* skip jobs from entity that marked guilty */ >>> @@ -443,6 +446,16 @@ struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity) >>> smp_wmb(); >>> >>> spsc_queue_pop(&entity->job_queue); >>> + >>> + /* >>> + * It's when head job is extracted we can access the next job (or empty) >>> + * queue and update the entity location in the min heap accordingly. >>> + */ >>> +skip: >>> + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) >>> + drm_sched_rq_update_fifo(entity, >>> + (sched_job ? sched_job->submit_ts : ktime_get())); >>> + >>> return sched_job; >>> } >>> >>> @@ -502,11 +515,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) >>> { >>> struct drm_sched_entity *entity = sched_job->entity; >>> bool first; >>> + ktime_t ts = ktime_get(); >>> >>> trace_drm_sched_job(sched_job, entity); >>> atomic_inc(entity->rq->sched->score); >>> WRITE_ONCE(entity->last_user, current->group_leader); >>> first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node); >>> + sched_job->submit_ts = ts; >>> >>> /* first job wakes up scheduler */ >>> if (first) { >>> @@ -518,8 +533,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) >>> DRM_ERROR("Trying to push to a killed entity\n"); >>> return; >>> } >>> + >>> drm_sched_rq_add_entity(entity->rq, entity); >>> spin_unlock(&entity->rq_lock); >>> + >>> + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) >>> + drm_sched_rq_update_fifo(entity, ts); >>> + >>> drm_sched_wakeup(entity->rq->sched); >>> } >>> } >>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c >>> index 4f2395d1a791..565707a1c5c7 100644 >>> --- a/drivers/gpu/drm/scheduler/sched_main.c >>> +++ b/drivers/gpu/drm/scheduler/sched_main.c >>> @@ -62,6 +62,64 @@ >>> #define to_drm_sched_job(sched_job) \ >>> container_of((sched_job), struct drm_sched_job, queue_node) >>> >>> +int drm_sched_policy = DRM_SCHED_POLICY_RR; >>> + >>> +/** >>> + * DOC: sched_policy (int) >>> + * Used to override default entities scheduling policy in a run queue. >>> + */ >>> +MODULE_PARM_DESC(sched_policy, >>> + "specify schedule policy for entities on a runqueue (DRM_SCHED_POLICY_RR = Round Robin (default), DRM_SCHED_POLICY_FIFO = use FIFO"); >>> +module_param_named(sched_policy, drm_sched_policy, int, 0444); >>> + >>> +static __always_inline bool drm_sched_entity_compare_before(struct rb_node *a, >>> + const struct rb_node *b) >>> +{ >>> + struct drm_sched_entity *ent_a = rb_entry((a), struct drm_sched_entity, rb_tree_node); >>> + struct drm_sched_entity *ent_b = rb_entry((b), struct drm_sched_entity, rb_tree_node); >>> + >>> + return ktime_before(ent_a->oldest_job_waiting, ent_b->oldest_job_waiting); >>> +} >>> + >>> +static inline void drm_sched_rq_remove_fifo_locked(struct drm_sched_entity *entity) >>> +{ >>> + struct drm_sched_rq *rq = entity->rq; >>> + >>> + if (!RB_EMPTY_NODE(&entity->rb_tree_node)) { >>> + rb_erase_cached(&entity->rb_tree_node, &rq->rb_tree_root); >>> + RB_CLEAR_NODE(&entity->rb_tree_node); >>> + } >>> +} >>> + >>> +static inline void drm_sched_rq_update_fifo_locked(struct drm_sched_entity *entity, >>> + ktime_t ts) >>> +{ >>> + struct drm_sched_rq *rq = entity->rq; >>> + >>> + drm_sched_rq_remove_fifo_locked(entity); >>> + >>> + entity->oldest_job_waiting = ts; >>> + >>> + rb_add_cached(&entity->rb_tree_node, &rq->rb_tree_root, >>> + drm_sched_entity_compare_before); >>> +} >>> + >>> +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts) >>> +{ >>> + /* >>> + * Both locks need to be grabbed, one to protect from entity->rq change >>> + * for entity from within concurrent drm_sched_entity_select_rq and the >>> + * other to update the rb tree structure. >>> + */ >>> + spin_lock(&entity->rq_lock); >>> + spin_lock(&entity->rq->lock); >>> + >>> + drm_sched_rq_update_fifo_locked(entity, ts); >>> + >>> + spin_unlock(&entity->rq->lock); >>> + spin_unlock(&entity->rq_lock); >>> +} >> I thought you were going to drop one of the locks here? >> Address this by either updating the comment to explain to future programmers >> what is going on here and why the locking is not consistent (2 vs 1 lock), >> or drop one of the locks, as per my previous review. > > > Note that after last review drm_sched_rq_update_fifo_locked is not > called anywhere > besides drm_sched_rq_update_fifo and so becomes obsolete and I will > remove it now. > In this case the double locking above is consistent and the reason is > explained in the > comment above. WHen you say "it's consistent" you mean it's being done from other places, I suppose. > > >> >>> + >>> /** >>> * drm_sched_rq_init - initialize a given run queue struct >>> * >>> @@ -75,6 +133,7 @@ static void drm_sched_rq_init(struct drm_gpu_scheduler *sched, >>> { >>> spin_lock_init(&rq->lock); >>> INIT_LIST_HEAD(&rq->entities); >>> + rq->rb_tree_root = RB_ROOT_CACHED; >>> rq->current_entity = NULL; >>> rq->sched = sched; >>> } >>> @@ -92,9 +151,12 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq, >>> { >>> if (!list_empty(&entity->list)) >>> return; >>> + >>> spin_lock(&rq->lock); >>> + >>> atomic_inc(rq->sched->score); >>> list_add_tail(&entity->list, &rq->entities); >>> + >>> spin_unlock(&rq->lock); >>> } >>> >>> @@ -111,23 +173,30 @@ void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, >>> { >>> if (list_empty(&entity->list)) >>> return; >>> + >>> spin_lock(&rq->lock); >>> + >>> atomic_dec(rq->sched->score); >>> list_del_init(&entity->list); >>> + >>> if (rq->current_entity == entity) >>> rq->current_entity = NULL; >>> + >>> + if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) >>> + drm_sched_rq_remove_fifo_locked(entity); >>> + >>> spin_unlock(&rq->lock); >>> } >>> >>> /** >>> - * drm_sched_rq_select_entity - Select an entity which could provide a job to run >>> + * drm_sched_rq_select_entity_rr - Select an entity which could provide a job to run >>> * >>> * @rq: scheduler run queue to check. >>> * >>> * Try to find a ready entity, returns NULL if none found. >>> */ >>> static struct drm_sched_entity * >>> -drm_sched_rq_select_entity(struct drm_sched_rq *rq) >>> +drm_sched_rq_select_entity_rr(struct drm_sched_rq *rq) >>> { >>> struct drm_sched_entity *entity; >>> >>> @@ -163,6 +232,36 @@ drm_sched_rq_select_entity(struct drm_sched_rq *rq) >>> return NULL; >>> } >>> >>> +/** >>> + * drm_sched_rq_select_entity_fifo - Select an entity which provides a job to run >>> + * >>> + * @rq: scheduler run queue to check. >>> + * >>> + * Find oldest waiting ready entity, returns NULL if none found. >>> + */ >>> +static struct drm_sched_entity * >>> +drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) >>> +{ >>> + struct rb_node *rb; >>> + bool found = false; >>> + struct drm_sched_entity *entity; >>> + >>> + spin_lock(&rq->lock); >>> + for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) { >>> + entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node); >>> + >>> + if (drm_sched_entity_is_ready(entity)) { >>> + rq->current_entity = entity; >>> + reinit_completion(&entity->entity_idle); >>> + found = true; >>> + break; >>> + } >>> + } >>> + spin_unlock(&rq->lock); >>> + >>> + return found ? entity : NULL; >>> +} >> You really don't need "found", and you don't need "entity" to be outside the loop. >> >> As per my last review, use this (which I've compiled and run): >> >> static struct drm_sched_entity * >> drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) >> { >> struct rb_node *rb; >> >> spin_lock(&rq->lock); >> for (rb = rb_first_cached(&rq->rb_tree_root); rb; rb = rb_next(rb)) { >> struct drm_sched_entity *entity; >> >> entity = rb_entry(rb, struct drm_sched_entity, rb_tree_node); >> if (drm_sched_entity_is_ready(entity)) { >> rq->current_entity = entity; >> reinit_completion(&entity->entity_idle); >> break; >> } >> } >> spin_unlock(&rq->lock); >> >> return rb ? rb_entry(rb, struct drm_sched_entity, rb_tree_node) : NULL; >> } >> >> The only way we can exit the loop is, >> 1. The loop invariant is false, i.e. rb == NULL, or >> 2. The "break;" jump from inside the if () inside the loop. >> >> Also note that "rb" does NOT need to be initialized, since, the for() statement >> is always executed, and the assignment "rb = rb_first_cached(&rq->rb_tree_root);" >> initializes it--this is why GCC doesn't complain :-) >> >> Also note that you don't need to export "entity" as it makes the for() loop relocatable >> to another function with only having a dependency on "rb" being defined--the loop >> is self-contained. >> >> At the "return" statement, we know that we've exited the loop, by either the loop >> invariant being false, i.e. rb == NULL, or by the "break;" jump, in which case >> we know that rb != NULL. This is why the "return;" statement as presented above works >> correctly. >> >> Please use that function in the way it is above, which is minimal and mature. > > > Makes sense, missed the point that at the end rb will be set to NULL Yeah, use the loop as I've written it above. Okay, send out v5. Regards, Luben > > Andrey > > >> >> Regards, >> Luben >> >>> + >>> /** >>> * drm_sched_job_done - complete a job >>> * @s_job: pointer to the job which is done >>> @@ -803,7 +902,9 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) >>> >>> /* Kernel run queue has higher priority than normal run queue*/ >>> for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { >>> - entity = drm_sched_rq_select_entity(&sched->sched_rq[i]); >>> + entity = drm_sched_policy != DRM_SCHED_POLICY_FIFO ? >>> + drm_sched_rq_select_entity_rr(&sched->sched_rq[i]) : >>> + drm_sched_rq_select_entity_fifo(&sched->sched_rq[i]); >>> if (entity) >>> break; >>> } >>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h >>> index 599855c6a672..1f7d9dd1a444 100644 >>> --- a/include/drm/gpu_scheduler.h >>> +++ b/include/drm/gpu_scheduler.h >>> @@ -50,6 +50,12 @@ enum drm_sched_priority { >>> DRM_SCHED_PRIORITY_UNSET = -2 >>> }; >>> >>> +/* Used to chose between FIFO and RR jobs scheduling */ >>> +extern int drm_sched_policy; >>> + >>> +#define DRM_SCHED_POLICY_RR 0 >>> +#define DRM_SCHED_POLICY_FIFO 1 >>> + >>> /** >>> * struct drm_sched_entity - A wrapper around a job queue (typically >>> * attached to the DRM file_priv). >>> @@ -196,6 +202,21 @@ struct drm_sched_entity { >>> * drm_sched_entity_fini(). >>> */ >>> struct completion entity_idle; >>> + >>> + /** >>> + * @oldest_job_waiting: >>> + * >>> + * Marks earliest job waiting in SW queue >>> + */ >>> + ktime_t oldest_job_waiting; >>> + >>> + /** >>> + * @rb_tree_node: >>> + * >>> + * The node used to insert this entity into time based priority queue >>> + */ >>> + struct rb_node rb_tree_node; >>> + >>> }; >>> >>> /** >>> @@ -205,6 +226,7 @@ struct drm_sched_entity { >>> * @sched: the scheduler to which this rq belongs to. >>> * @entities: list of the entities to be scheduled. >>> * @current_entity: the entity which is to be scheduled. >>> + * @rb_tree_root: root of time based priory queue of entities for FIFO scheduling >>> * >>> * Run queue is a set of entities scheduling command submissions for >>> * one specific ring. It implements the scheduling policy that selects >>> @@ -215,6 +237,7 @@ struct drm_sched_rq { >>> struct drm_gpu_scheduler *sched; >>> struct list_head entities; >>> struct drm_sched_entity *current_entity; >>> + struct rb_root_cached rb_tree_root; >>> }; >>> >>> /** >>> @@ -314,6 +337,13 @@ struct drm_sched_job { >>> >>> /** @last_dependency: tracks @dependencies as they signal */ >>> unsigned long last_dependency; >>> + >>> + /** >>> + * @submit_ts: >>> + * >>> + * When the job was pushed into the entity queue. >>> + */ >>> + ktime_t submit_ts; >>> }; >>> >>> static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job, >>> @@ -503,6 +533,8 @@ void drm_sched_rq_add_entity(struct drm_sched_rq *rq, >>> void drm_sched_rq_remove_entity(struct drm_sched_rq *rq, >>> struct drm_sched_entity *entity); >>> >>> +void drm_sched_rq_update_fifo(struct drm_sched_entity *entity, ktime_t ts); >>> + >>> int drm_sched_entity_init(struct drm_sched_entity *entity, >>> enum drm_sched_priority priority, >>> struct drm_gpu_scheduler **sched_list,