On Wed, May 01, 2024 at 05:09:40AM -1000, Tejun Heo wrote: > When a task switches to a new sched_class, the prev and new classes are > notified through ->switched_from() and ->switched_to(), respectively, after > the switching is done. > > A new BPF extensible sched_class will have callbacks that allow the BPF > scheduler to keep track of relevant task states (like priority and cpumask). > Those callbacks aren't called while a task is on a different sched_class. > When a task comes back, we wanna tell the BPF progs the up-to-date state > before the task gets enqueued, so we need a hook which is called before the > switching is committed. > > This patch adds ->switching_to() which is called during sched_class switch > through check_class_changing() before the task is restored. Also, this patch > exposes check_class_changing/changed() in kernel/sched/sched.h. They will be > used by the new BPF extensible sched_class to implement implicit sched_class > switching which is used e.g. when falling back to CFS when the BPF scheduler > fails or unloads. > > This is a prep patch and doesn't cause any behavior changes. The new > operation and exposed functions aren't used yet. > > v2: Improve patch description w/ details on planned use. > > Signed-off-by: Tejun Heo <tj@xxxxxxxxxx> > Reviewed-by: David Vernet <dvernet@xxxxxxxx> > Acked-by: Josh Don <joshdon@xxxxxxxxxx> > Acked-by: Hao Luo <haoluo@xxxxxxxxxx> > Acked-by: Barret Rhoden <brho@xxxxxxxxxx> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 8e23f19e8096..99e292368d11 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -2301,6 +2301,7 @@ struct sched_class { > * cannot assume the switched_from/switched_to pair is serialized by > * rq->lock. They are however serialized by p->pi_lock. 
> */ > + void (*switching_to) (struct rq *this_rq, struct task_struct *task); > void (*switched_from)(struct rq *this_rq, struct task_struct *task); > void (*switched_to) (struct rq *this_rq, struct task_struct *task); > void (*reweight_task)(struct rq *this_rq, struct task_struct *task, So I *think* that I can handle all the current cases in sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags. Would that work for the BPF thing as well? Something like the very much incomplete below... It would allow removing all these switch{ed,ing}_{to,from}() things entirely, instead of adding yet more. --- diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0935f9d4bb7b..da54c9f8f78d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6864,15 +6864,22 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); -void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int prio) { + const struct sched_class *class; + if (dl_prio(prio)) - p->sched_class = &dl_sched_class; + class = &dl_sched_class; else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; + class = &rt_sched_class; else - p->sched_class = &fair_sched_class; + class = &fair_sched_class; + return class; +} + +void __setscheduler_prio(struct task_struct *p, int prio) +{ p->prio = prio; } @@ -6919,7 +6926,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *class; struct rq_flags rf; struct rq *rq; @@ -6977,6 +6984,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + class = __setscheduler_class(prio); + if (prev_class != class) + queue_flag |= DEQUEUE_CLASS; + queued = task_on_rq_queued(p); 
running = task_current(rq, p); if (queued) @@ -7014,6 +7025,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } + p->sched_class = class; __setscheduler_prio(p, prio); if (queued) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 62fd8bc6fd08..a03995d81c75 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2251,6 +2251,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_CLASS 0x200 /* Matches ENQUEUE_CLASS */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2266,6 +2267,7 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_CLASS 0x200 #define RETRY_TASK ((void *)-1UL) @@ -3603,6 +3605,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern const struct sched_class *__setscheduler_class(int prio); extern void __setscheduler_prio(struct task_struct *p, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ae1b42775ef9..dc104d996204 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -612,7 +612,7 @@ int __sched_setscheduler(struct task_struct *p, { int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *class; struct balance_callback *head; struct rq_flags rf; int reset_on_fork; @@ -783,6 +783,12 @@ int 
__sched_setscheduler(struct task_struct *p, queue_flags &= ~DEQUEUE_MOVE; } + class = prev_class = p->sched_class; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) + class = __setscheduler_class(newprio); + if (prev_class != class) + queue_flags |= DEQUEUE_CLASS; + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -790,10 +796,9 @@ int __sched_setscheduler(struct task_struct *p, if (running) put_prev_task(rq, p); - prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); + p->sched_class = class; __setscheduler_prio(p, newprio); } __setscheduler_uclamp(p, attr);