Re: [PATCH 12/36] sched_ext: Implement BPF extensible scheduler class

Andrea Righi <andrea.righi@xxxxxxxxxxxxx> · Thu, 23 Nov 2023 09:07:35 +0100

On Fri, Nov 10, 2023 at 04:47:38PM -1000, Tejun Heo wrote:
...
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3961,6 +3961,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
>  
>  static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
>  {
> +	/*
> +	 * The BPF scheduler may depend on select_task_rq() being invoked during
> +	 * wakeups. In addition, @p may end up executing on a different CPU
> +	 * regardless of what happens in the wakeup path making the ttwu_queue
> +	 * optimization less meaningful. Skip if on SCX.
> +	 */
> +	if (task_on_scx(p))
> +		return false;
> +
>  	/*
>  	 * Do not complicate things with the async wake_list while the CPU is
>  	 * in hotplug state.
> @@ -4531,6 +4540,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>  	p->rt.on_rq		= 0;
>  	p->rt.on_list		= 0;
>  
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	p->scx.dsq		= NULL;
> +	INIT_LIST_HEAD(&p->scx.dsq_node);
> +	p->scx.flags		= 0;
> +	p->scx.weight		= 0;
> +	p->scx.sticky_cpu	= -1;
> +	p->scx.holding_cpu	= -1;
> +	p->scx.kf_mask		= 0;
> +	atomic64_set(&p->scx.ops_state, 0);

We probably need atomic_long_set() here or in 32-bit arches (such as
armhf) we get this:

kernel/sched/core.c:4564:22: error: passing argument 1 of ‘atomic64_set’ from incompatible pointer type [-Werror=incompatible-pointer-types]
 4564 |         atomic64_set(&p->scx.ops_state, 0);
      |                      ^~~~~~~~~~~~~~~~~
      |                      |
      |                      atomic_long_t * {aka atomic_t *}

> +	p->scx.slice		= SCX_SLICE_DFL;
> +#endif
> +
>  #ifdef CONFIG_PREEMPT_NOTIFIERS
>  	INIT_HLIST_HEAD(&p->preempt_notifiers);
>  #endif
> @@ -4779,6 +4800,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>  		goto out_cancel;
>  	} else if (rt_prio(p->prio)) {
>  		p->sched_class = &rt_sched_class;
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	} else if (task_should_scx(p)) {
> +		p->sched_class = &ext_sched_class;
> +#endif
>  	} else {
>  		p->sched_class = &fair_sched_class;
>  	}
> @@ -7059,6 +7084,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
>  		p->sched_class = &dl_sched_class;
>  	else if (rt_prio(prio))
>  		p->sched_class = &rt_sched_class;
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	else if (task_should_scx(p))
> +		p->sched_class = &ext_sched_class;
> +#endif
>  	else
>  		p->sched_class = &fair_sched_class;
>  
> @@ -9055,6 +9084,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
>  	case SCHED_NORMAL:
>  	case SCHED_BATCH:
>  	case SCHED_IDLE:
> +	case SCHED_EXT:
>  		ret = 0;
>  		break;
>  	}
> @@ -9082,6 +9112,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
>  	case SCHED_NORMAL:
>  	case SCHED_BATCH:
>  	case SCHED_IDLE:
> +	case SCHED_EXT:
>  		ret = 0;
>  	}
>  	return ret;
> @@ -9918,6 +9949,10 @@ void __init sched_init(void)
>  	BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
>  	BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
>  	BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
> +	BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
> +#endif
>  
>  	wait_bit_init();
>  
> @@ -12047,3 +12082,38 @@ void sched_mm_cid_fork(struct task_struct *t)
>  	t->mm_cid_active = 1;
>  }
>  #endif
> +
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> +			    struct sched_enq_and_set_ctx *ctx)
> +{
> +	struct rq *rq = task_rq(p);
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	*ctx = (struct sched_enq_and_set_ctx){
> +		.p = p,
> +		.queue_flags = queue_flags,
> +		.queued = task_on_rq_queued(p),
> +		.running = task_current(rq, p),
> +	};
> +
> +	update_rq_clock(rq);
> +	if (ctx->queued)
> +		dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
> +	if (ctx->running)
> +		put_prev_task(rq, p);
> +}
> +
> +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
> +{
> +	struct rq *rq = task_rq(ctx->p);
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	if (ctx->queued)
> +		enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
> +	if (ctx->running)
> +		set_next_task(rq, ctx->p);
> +}
> +#endif	/* CONFIG_SCHED_CLASS_EXT */
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 4580a450700e..6587a45ffe96 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -374,6 +374,9 @@ static __init int sched_init_debug(void)
>  
>  	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
>  
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops);
> +#endif
>  	return 0;
>  }
>  late_initcall(sched_init_debug);
> @@ -1085,6 +1088,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  		P(dl.runtime);
>  		P(dl.deadline);
>  	}
> +#ifdef CONFIG_SCHED_CLASS_EXT
> +	__PS("ext.enabled", task_on_scx(p));
> +#endif
>  #undef PN_SCHEDSTAT
>  #undef P_SCHEDSTAT
>  
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> new file mode 100644
> index 000000000000..7b78f77d2293
> --- /dev/null
> +++ b/kernel/sched/ext.c
> @@ -0,0 +1,3158 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2022 Tejun Heo <tj@xxxxxxxxxx>
> + * Copyright (c) 2022 David Vernet <dvernet@xxxxxxxx>
> + */
> +#define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
> +
> +enum scx_internal_consts {
> +	SCX_NR_ONLINE_OPS	= SCX_OP_IDX(init),
> +	SCX_DSP_DFL_MAX_BATCH	= 32,
> +};
> +
> +enum scx_ops_enable_state {
> +	SCX_OPS_PREPPING,
> +	SCX_OPS_ENABLING,
> +	SCX_OPS_ENABLED,
> +	SCX_OPS_DISABLING,
> +	SCX_OPS_DISABLED,
> +};
> +
> +/*
> + * sched_ext_entity->ops_state
> + *
> + * Used to track the task ownership between the SCX core and the BPF scheduler.
> + * State transitions look as follows:
> + *
> + * NONE -> QUEUEING -> QUEUED -> DISPATCHING
> + *   ^              |                 |
> + *   |              v                 v
> + *   \-------------------------------/
> + *
> + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
> + * sites for explanations on the conditions being waited upon and why they are
> + * safe. Transitions out of them into NONE or QUEUED must store_release and the
> + * waiters should load_acquire.
> + *
> + * Tracking scx_ops_state enables sched_ext core to reliably determine whether
> + * any given task can be dispatched by the BPF scheduler at all times and thus
> + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
> + * to try to dispatch any task anytime regardless of its state as the SCX core
> + * can safely reject invalid dispatches.
> + */
> +enum scx_ops_state {
> +	SCX_OPSS_NONE,		/* owned by the SCX core */
> +	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
> +	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
> +	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */
> +
> +	/*
> +	 * QSEQ brands each QUEUED instance so that, when dispatch races
> +	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
> +	 * on the task being dispatched.
> +	 *
> +	 * As some 32bit archs can't do 64bit store_release/load_acquire,
> +	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
> +	 * 32bit machines. The dispatch race window QSEQ protects is very narrow
> +	 * and runs with IRQ disabled. 30 bits should be sufficient.
> +	 */
> +	SCX_OPSS_QSEQ_SHIFT	= 2,
> +};
> +
> +/* Use macros to ensure that the type is unsigned long for the masks */
> +#define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
> +#define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
> +
> +/*
> + * During exit, a task may schedule after losing its PIDs. When disabling the
> + * BPF scheduler, we need to be able to iterate tasks in every state to
> + * guarantee system safety. Maintain a dedicated task list which contains every
> + * task between its fork and eventual free.
> + */
> +static DEFINE_SPINLOCK(scx_tasks_lock);
> +static LIST_HEAD(scx_tasks);
> +
> +/* ops enable/disable */
> +static struct kthread_worker *scx_ops_helper;
> +static DEFINE_MUTEX(scx_ops_enable_mutex);
> +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
> +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
> +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
> +static struct sched_ext_ops scx_ops;
> +static bool scx_warned_zero_slice;
> +
> +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
> +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
> +
> +struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] =
> +	{ [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT };
> +
> +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
> +static struct scx_exit_info scx_exit_info;
> +
> +/* idle tracking */
> +#ifdef CONFIG_SMP
> +#ifdef CONFIG_CPUMASK_OFFSTACK
> +#define CL_ALIGNED_IF_ONSTACK
> +#else
> +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
> +#endif
> +
> +static struct {
> +	cpumask_var_t cpu;
> +	cpumask_var_t smt;
> +} idle_masks CL_ALIGNED_IF_ONSTACK;
> +
> +#endif	/* CONFIG_SMP */
> +
> +/*
> + * Direct dispatch marker.
> + *
> + * Non-NULL values are used for direct dispatch from enqueue path. A valid
> + * pointer points to the task currently being enqueued. An ERR_PTR value is used
> + * to indicate that direct dispatch has already happened.
> + */
> +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
> +
> +/* dispatch queues */
> +static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
> +
> +static const struct rhashtable_params dsq_hash_params = {
> +	.key_len		= 8,
> +	.key_offset		= offsetof(struct scx_dispatch_q, id),
> +	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
> +};
> +
> +static struct rhashtable dsq_hash;
> +static LLIST_HEAD(dsqs_to_free);
> +
> +/* dispatch buf */
> +struct scx_dsp_buf_ent {
> +	struct task_struct	*task;
> +	unsigned long		qseq;
> +	u64			dsq_id;
> +	u64			enq_flags;
> +};
> +
> +static u32 scx_dsp_max_batch;
> +static struct scx_dsp_buf_ent __percpu *scx_dsp_buf;
> +
> +struct scx_dsp_ctx {
> +	struct rq		*rq;
> +	struct rq_flags		*rf;
> +	u32			buf_cursor;
> +	u32			nr_tasks;
> +};
> +
> +static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx);
> +
> +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
> +		      u64 enq_flags);
> +__printf(2, 3) static void scx_ops_error_kind(enum scx_exit_kind kind,
> +					      const char *fmt, ...);
> +#define scx_ops_error(fmt, args...)						\
> +	scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
> +
> +struct scx_task_iter {
> +	struct sched_ext_entity		cursor;
> +	struct task_struct		*locked;
> +	struct rq			*rq;
> +	struct rq_flags			rf;
> +};
> +
> +#define SCX_HAS_OP(op)	static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
> +
> +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
> +static u32 higher_bits(u32 flags)
> +{
> +	return ~((1 << fls(flags)) - 1);
> +}
> +
> +/* return the mask with only the highest bit set */
> +static u32 highest_bit(u32 flags)
> +{
> +	int bit = fls(flags);
> +	return bit ? 1 << (bit - 1) : 0;
> +}
> +
> +/*
> + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
> + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
> + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
> + * whether it's running from an allowed context.
> + *
> + * @mask is constant, always inline to cull the mask calculations.
> + */
> +static __always_inline void scx_kf_allow(u32 mask)
> +{
> +	/* nesting is allowed only in increasing scx_kf_mask order */
> +	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
> +		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
> +		  current->scx.kf_mask, mask);
> +	current->scx.kf_mask |= mask;
> +}
> +
> +static void scx_kf_disallow(u32 mask)
> +{
> +	current->scx.kf_mask &= ~mask;
> +}
> +
> +#define SCX_CALL_OP(mask, op, args...)						\
> +do {										\
> +	if (mask) {								\
> +		scx_kf_allow(mask);						\
> +		scx_ops.op(args);						\
> +		scx_kf_disallow(mask);						\
> +	} else {								\
> +		scx_ops.op(args);						\
> +	}									\
> +} while (0)
> +
> +#define SCX_CALL_OP_RET(mask, op, args...)					\
> +({										\
> +	__typeof__(scx_ops.op(args)) __ret;					\
> +	if (mask) {								\
> +		scx_kf_allow(mask);						\
> +		__ret = scx_ops.op(args);					\
> +		scx_kf_disallow(mask);						\
> +	} else {								\
> +		__ret = scx_ops.op(args);					\
> +	}									\
> +	__ret;									\
> +})
> +
> +/* @mask is constant, always inline to cull unnecessary branches */
> +static __always_inline bool scx_kf_allowed(u32 mask)
> +{
> +	if (unlikely(!(current->scx.kf_mask & mask))) {
> +		scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
> +			      mask, current->scx.kf_mask);
> +		return false;
> +	}
> +
> +	if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) &&
> +		     in_interrupt())) {
> +		scx_ops_error("sleepable kfunc called from non-sleepable context");
> +		return false;
> +	}
> +
> +	/*
> +	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
> +	 * DISPATCH must not be called if we're running DEQUEUE which is nested
> +	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
> +	 * boundary thanks to the above in_interrupt() check.
> +	 */
> +	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
> +		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
> +		scx_ops_error("dispatch kfunc called from a nested operation");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +/**
> + * scx_task_iter_init - Initialize a task iterator
> + * @iter: iterator to init
> + *
> + * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
> + * @iter must eventually be exited with scx_task_iter_exit().
> + *
> + * scx_tasks_lock may be released between this and the first next() call or
> + * between any two next() calls. If scx_tasks_lock is released between two
> + * next() calls, the caller is responsible for ensuring that the task being
> + * iterated remains accessible either through RCU read lock or obtaining a
> + * reference count.
> + *
> + * All tasks which existed when the iteration started are guaranteed to be
> + * visited as long as they still exist.
> + */
> +static void scx_task_iter_init(struct scx_task_iter *iter)
> +{
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
> +	list_add(&iter->cursor.tasks_node, &scx_tasks);
> +	iter->locked = NULL;
> +}
> +
> +/**
> + * scx_task_iter_exit - Exit a task iterator
> + * @iter: iterator to exit
> + *
> + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
> + * If the iterator holds a task's rq lock, that rq lock is released. See
> + * scx_task_iter_init() for details.
> + */
> +static void scx_task_iter_exit(struct scx_task_iter *iter)
> +{
> +	struct list_head *cursor = &iter->cursor.tasks_node;
> +
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	if (iter->locked) {
> +		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> +		iter->locked = NULL;
> +	}
> +
> +	if (list_empty(cursor))
> +		return;
> +
> +	list_del_init(cursor);
> +}
> +
> +/**
> + * scx_task_iter_next - Next task
> + * @iter: iterator to walk
> + *
> + * Visit the next task. See scx_task_iter_init() for details.
> + */
> +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
> +{
> +	struct list_head *cursor = &iter->cursor.tasks_node;
> +	struct sched_ext_entity *pos;
> +
> +	lockdep_assert_held(&scx_tasks_lock);
> +
> +	list_for_each_entry(pos, cursor, tasks_node) {
> +		if (&pos->tasks_node == &scx_tasks)
> +			return NULL;
> +		if (!(pos->flags & SCX_TASK_CURSOR)) {
> +			list_move(cursor, &pos->tasks_node);
> +			return container_of(pos, struct task_struct, scx);
> +		}
> +	}
> +
> +	/* can't happen, should always terminate at scx_tasks above */
> +	BUG();
> +}
> +
> +/**
> + * scx_task_iter_next_filtered - Next non-idle task
> + * @iter: iterator to walk
> + *
> + * Visit the next non-idle task. See scx_task_iter_init() for details.
> + */
> +static struct task_struct *
> +scx_task_iter_next_filtered(struct scx_task_iter *iter)
> +{
> +	struct task_struct *p;
> +
> +	while ((p = scx_task_iter_next(iter))) {
> +		/*
> +		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
> +		 * which haven't yet been onlined. Test sched_class directly.
> +		 */
> +		if (p->sched_class != &idle_sched_class)
> +			return p;
> +	}
> +	return NULL;
> +}
> +
> +/**
> + * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked
> + * @iter: iterator to walk
> + *
> + * Visit the next non-idle task with its rq lock held. See scx_task_iter_init()
> + * for details.
> + */
> +static struct task_struct *
> +scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
> +{
> +	struct task_struct *p;
> +
> +	if (iter->locked) {
> +		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
> +		iter->locked = NULL;
> +	}
> +
> +	p = scx_task_iter_next_filtered(iter);
> +	if (!p)
> +		return NULL;
> +
> +	iter->rq = task_rq_lock(p, &iter->rf);
> +	iter->locked = p;
> +	return p;
> +}
> +
> +static enum scx_ops_enable_state scx_ops_enable_state(void)
> +{
> +	return atomic_read(&scx_ops_enable_state_var);
> +}
> +
> +static enum scx_ops_enable_state
> +scx_ops_set_enable_state(enum scx_ops_enable_state to)
> +{
> +	return atomic_xchg(&scx_ops_enable_state_var, to);
> +}
> +
> +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
> +					enum scx_ops_enable_state from)
> +{
> +	int from_v = from;
> +
> +	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
> +}
> +
> +static bool scx_ops_disabling(void)
> +{
> +	return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING);
> +}
> +
> +/**
> + * wait_ops_state - Busy-wait the specified ops state to end
> + * @p: target task
> + * @opss: state to wait the end of
> + *
> + * Busy-wait for @p to transition out of @opss. This can only be used when the
> + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
> + * has load_acquire semantics to ensure that the caller can see the updates made
> + * in the enqueueing and dispatching paths.
> + */
> +static void wait_ops_state(struct task_struct *p, unsigned long opss)
> +{
> +	do {
> +		cpu_relax();
> +	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
> +}
> +
> +/**
> + * ops_cpu_valid - Verify a cpu number
> + * @cpu: cpu number which came from a BPF ops
> + *
> + * @cpu is a cpu number which came from the BPF scheduler and can be any value.
> + * Verify that it is in range and one of the possible cpus.
> + */
> +static bool ops_cpu_valid(s32 cpu)
> +{
> +	return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
> +}
> +
> +/**
> + * ops_sanitize_err - Sanitize a -errno value
> + * @ops_name: operation to blame on failure
> + * @err: -errno value to sanitize
> + *
> + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
> + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
> + * cause misbehaviors. For an example, a large negative return from
> + * ops.prep_enable() triggers an oops when passed up the call chain because the
> + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
> + * handled as a pointer.
> + */
> +static int ops_sanitize_err(const char *ops_name, s32 err)
> +{
> +	if (err < 0 && err >= -MAX_ERRNO)
> +		return err;
> +
> +	scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
> +	return -EPROTO;
> +}
> +
> +static void update_curr_scx(struct rq *rq)
> +{
> +	struct task_struct *curr = rq->curr;
> +	u64 now = rq_clock_task(rq);
> +	u64 delta_exec;
> +
> +	if (time_before_eq64(now, curr->se.exec_start))
> +		return;
> +
> +	delta_exec = now - curr->se.exec_start;
> +	curr->se.exec_start = now;
> +	curr->se.sum_exec_runtime += delta_exec;
> +	account_group_exec_runtime(curr, delta_exec);
> +	cgroup_account_cputime(curr, delta_exec);
> +
> +	curr->scx.slice -= min(curr->scx.slice, delta_exec);
> +}
> +
> +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
> +			     u64 enq_flags)
> +{
> +	bool is_local = dsq->id == SCX_DSQ_LOCAL;
> +
> +	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node));
> +
> +	if (!is_local) {
> +		raw_spin_lock(&dsq->lock);
> +		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
> +			scx_ops_error("attempting to dispatch to a destroyed dsq");
> +			/* fall back to the global dsq */
> +			raw_spin_unlock(&dsq->lock);
> +			dsq = &scx_dsq_global;
> +			raw_spin_lock(&dsq->lock);
> +		}
> +	}
> +
> +	if (enq_flags & SCX_ENQ_HEAD)
> +		list_add(&p->scx.dsq_node, &dsq->fifo);
> +	else
> +		list_add_tail(&p->scx.dsq_node, &dsq->fifo);
> +	dsq->nr++;
> +	p->scx.dsq = dsq;
> +
> +	/*
> +	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
> +	 * match waiters' load_acquire.
> +	 */
> +	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> +
> +	if (is_local) {
> +		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
> +
> +		if (sched_class_above(&ext_sched_class, rq->curr->sched_class))
> +			resched_curr(rq);
> +	} else {
> +		raw_spin_unlock(&dsq->lock);
> +	}
> +}
> +
> +static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
> +{
> +	struct scx_dispatch_q *dsq = p->scx.dsq;
> +	bool is_local = dsq == &scx_rq->local_dsq;
> +
> +	if (!dsq) {
> +		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
> +		/*
> +		 * When dispatching directly from the BPF scheduler to a local
> +		 * DSQ, the task isn't associated with any DSQ but
> +		 * @p->scx.holding_cpu may be set under the protection of
> +		 * %SCX_OPSS_DISPATCHING.
> +		 */
> +		if (p->scx.holding_cpu >= 0)
> +			p->scx.holding_cpu = -1;
> +		return;
> +	}
> +
> +	if (!is_local)
> +		raw_spin_lock(&dsq->lock);
> +
> +	/*
> +	 * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node
> +	 * can't change underneath us.
> +	*/
> +	if (p->scx.holding_cpu < 0) {
> +		/* @p must still be on @dsq, dequeue */
> +		WARN_ON_ONCE(list_empty(&p->scx.dsq_node));
> +		list_del_init(&p->scx.dsq_node);
> +		dsq->nr--;
> +	} else {
> +		/*
> +		 * We're racing against dispatch_to_local_dsq() which already
> +		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
> +		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
> +		 * the race.
> +		 */
> +		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
> +		p->scx.holding_cpu = -1;
> +	}
> +	p->scx.dsq = NULL;
> +
> +	if (!is_local)
> +		raw_spin_unlock(&dsq->lock);
> +}
> +
> +static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
> +{
> +	lockdep_assert(rcu_read_lock_any_held());
> +
> +	if (dsq_id == SCX_DSQ_GLOBAL)
> +		return &scx_dsq_global;
> +	else
> +		return rhashtable_lookup_fast(&dsq_hash, &dsq_id,
> +					      dsq_hash_params);
> +}
> +
> +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
> +						    struct task_struct *p)
> +{
> +	struct scx_dispatch_q *dsq;
> +
> +	if (dsq_id == SCX_DSQ_LOCAL)
> +		return &rq->scx.local_dsq;
> +
> +	dsq = find_non_local_dsq(dsq_id);
> +	if (unlikely(!dsq)) {
> +		scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
> +			      dsq_id, p->comm, p->pid);
> +		return &scx_dsq_global;
> +	}
> +
> +	return dsq;
> +}
> +
> +static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p,
> +			    u64 dsq_id, u64 enq_flags)
> +{
> +	struct scx_dispatch_q *dsq;
> +
> +	/* @p must match the task which is being enqueued */
> +	if (unlikely(p != ddsp_task)) {
> +		if (IS_ERR(ddsp_task))
> +			scx_ops_error("%s[%d] already direct-dispatched",
> +				      p->comm, p->pid);
> +		else
> +			scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]",
> +				      ddsp_task->comm, ddsp_task->pid,
> +				      p->comm, p->pid);
> +		return;
> +	}
> +
> +	/*
> +	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
> +	 * dispatching to the local DSQ of a different CPU requires unlocking
> +	 * the current rq which isn't allowed in the enqueue path. Use
> +	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
> +	 */
> +	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
> +		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
> +		return;
> +	}
> +
> +	dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p);
> +	dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +
> +	/*
> +	 * Mark that dispatch already happened by spoiling direct_dispatch_task
> +	 * with a non-NULL value which can never match a valid task pointer.
> +	 */
> +	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
> +}
> +
> +static bool test_rq_online(struct rq *rq)
> +{
> +#ifdef CONFIG_SMP
> +	return rq->online;
> +#else
> +	return true;
> +#endif
> +}
> +
> +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> +			    int sticky_cpu)
> +{
> +	struct task_struct **ddsp_taskp;
> +	unsigned long qseq;
> +
> +	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
> +
> +	if (p->scx.flags & SCX_TASK_ENQ_LOCAL) {
> +		enq_flags |= SCX_ENQ_LOCAL;
> +		p->scx.flags &= ~SCX_TASK_ENQ_LOCAL;
> +	}
> +
> +	/* rq migration */
> +	if (sticky_cpu == cpu_of(rq))
> +		goto local_norefill;
> +
> +	/*
> +	 * If !rq->online, we already told the BPF scheduler that the CPU is
> +	 * offline. We're just trying to on/offline the CPU. Don't bother the
> +	 * BPF scheduler.
> +	 */
> +	if (unlikely(!test_rq_online(rq)))
> +		goto local;
> +
> +	/* see %SCX_OPS_ENQ_EXITING */
> +	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
> +	    unlikely(p->flags & PF_EXITING))
> +		goto local;
> +
> +	/* see %SCX_OPS_ENQ_LAST */
> +	if (!static_branch_unlikely(&scx_ops_enq_last) &&
> +	    (enq_flags & SCX_ENQ_LAST))
> +		goto local;
> +
> +	if (!SCX_HAS_OP(enqueue)) {
> +		if (enq_flags & SCX_ENQ_LOCAL)
> +			goto local;
> +		else
> +			goto global;
> +	}
> +
> +	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
> +	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
> +
> +	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> +	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
> +
> +	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
> +	WARN_ON_ONCE(*ddsp_taskp);
> +	*ddsp_taskp = p;
> +
> +	SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
> +
> +	/*
> +	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
> +	 * dequeue may be waiting. The store_release matches their load_acquire.
> +	 */
> +	if (*ddsp_taskp == p)
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
> +	*ddsp_taskp = NULL;
> +	return;
> +
> +local:
> +	p->scx.slice = SCX_SLICE_DFL;
> +local_norefill:
> +	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
> +	return;
> +
> +global:
> +	p->scx.slice = SCX_SLICE_DFL;
> +	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
> +}
> +
> +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
> +{
> +	int sticky_cpu = p->scx.sticky_cpu;
> +
> +	enq_flags |= rq->scx.extra_enq_flags;
> +
> +	if (sticky_cpu >= 0)
> +		p->scx.sticky_cpu = -1;
> +
> +	/*
> +	 * Restoring a running task will be immediately followed by
> +	 * set_next_task_scx() which expects the task to not be on the BPF
> +	 * scheduler as tasks can only start running through local DSQs. Force
> +	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
> +	 */
> +	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
> +		sticky_cpu = cpu_of(rq);
> +
> +	if (p->scx.flags & SCX_TASK_QUEUED)
> +		return;
> +
> +	p->scx.flags |= SCX_TASK_QUEUED;
> +	rq->scx.nr_running++;
> +	add_nr_running(rq, 1);
> +
> +	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
> +}
> +
> +static void ops_dequeue(struct task_struct *p, u64 deq_flags)
> +{
> +	unsigned long opss;
> +
> +	/* acquire ensures that we see the preceding updates on QUEUED */
> +	opss = atomic_long_read_acquire(&p->scx.ops_state);
> +
> +	switch (opss & SCX_OPSS_STATE_MASK) {
> +	case SCX_OPSS_NONE:
> +		break;
> +	case SCX_OPSS_QUEUEING:
> +		/*
> +		 * QUEUEING is started and finished while holding @p's rq lock.
> +		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
> +		 */
> +		BUG();
> +	case SCX_OPSS_QUEUED:
> +		if (SCX_HAS_OP(dequeue))
> +			SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags);
> +
> +		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> +					    SCX_OPSS_NONE))
> +			break;
> +		fallthrough;
> +	case SCX_OPSS_DISPATCHING:
> +		/*
> +		 * If @p is being dispatched from the BPF scheduler to a DSQ,
> +		 * wait for the transfer to complete so that @p doesn't get
> +		 * added to its DSQ after dequeueing is complete.
> +		 *
> +		 * As we're waiting on DISPATCHING with the rq locked, the
> +		 * dispatching side shouldn't try to lock the rq while
> +		 * DISPATCHING is set. See dispatch_to_local_dsq().
> +		 *
> +		 * DISPATCHING shouldn't have qseq set and control can reach
> +		 * here with NONE @opss from the above QUEUED case block.
> +		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
> +		 */
> +		wait_ops_state(p, SCX_OPSS_DISPATCHING);
> +		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
> +		break;
> +	}
> +}
> +
> +static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +
> +	if (!(p->scx.flags & SCX_TASK_QUEUED))
> +		return;
> +
> +	ops_dequeue(p, deq_flags);
> +
> +	if (deq_flags & SCX_DEQ_SLEEP)
> +		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
> +	else
> +		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
> +
> +	p->scx.flags &= ~SCX_TASK_QUEUED;
> +	scx_rq->nr_running--;
> +	sub_nr_running(rq, 1);
> +
> +	dispatch_dequeue(scx_rq, p);
> +}
> +
> +static void yield_task_scx(struct rq *rq)
> +{
> +	struct task_struct *p = rq->curr;
> +
> +	if (SCX_HAS_OP(yield))
> +		SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL);
> +	else
> +		p->scx.slice = 0;
> +}
> +
> +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
> +{
> +	struct task_struct *from = rq->curr;
> +
> +	if (SCX_HAS_OP(yield))
> +		return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to);
> +	else
> +		return false;
> +}
> +
> +#ifdef CONFIG_SMP
> +/**
> + * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
> + * @rq: rq to move the task into, currently locked
> + * @p: task to move
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * Move @p which is currently on a different rq to @rq's local DSQ. The caller
> + * must:
> + *
> + * 1. Start with exclusive access to @p either through its DSQ lock or
> + *    %SCX_OPSS_DISPATCHING flag.
> + *
> + * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
> + *
> + * 3. Remember task_rq(@p). Release the exclusive access so that we don't
> + *    deadlock with dequeue.
> + *
> + * 4. Lock @rq and the task_rq from #3.
> + *
> + * 5. Call this function.
> + *
> + * Returns %true if @p was successfully moved. %false after racing dequeue and
> + * losing.
> + */
> +static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
> +				   u64 enq_flags)
> +{
> +	struct rq *task_rq;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	/*
> +	 * If dequeue got to @p while we were trying to lock both rq's, it'd
> +	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
> +	 * updated it to different values afterwards, as this operation can't be
> +	 * preempted or recurse, @p->scx.holding_cpu can never become
> +	 * raw_smp_processor_id() again before we're done. Thus, we can tell
> +	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
> +	 * still raw_smp_processor_id().
> +	 *
> +	 * See dispatch_dequeue() for the counterpart.
> +	 */
> +	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
> +		return false;
> +
> +	/* @p->rq couldn't have changed if we're still the holding cpu */
> +	task_rq = task_rq(p);
> +	lockdep_assert_rq_held(task_rq);
> +
> +	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
> +	deactivate_task(task_rq, p, 0);
> +	set_task_cpu(p, cpu_of(rq));
> +	p->scx.sticky_cpu = cpu_of(rq);
> +
> +	/*
> +	 * We want to pass scx-specific enq_flags but activate_task() will
> +	 * truncate the upper 32 bit. As we own @rq, we can pass them through
> +	 * @rq->scx.extra_enq_flags instead.
> +	 */
> +	WARN_ON_ONCE(rq->scx.extra_enq_flags);
> +	rq->scx.extra_enq_flags = enq_flags;
> +	activate_task(rq, p, 0);
> +	rq->scx.extra_enq_flags = 0;
> +
> +	return true;
> +}
> +
> +/**
> + * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @src_rq: rq to move task from
> + * @dst_rq: rq to move task to
> + *
> + * We're holding @rq lock and trying to dispatch a task from @src_rq to
> + * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
> + * @rq stays locked isn't important as long as the state is restored after
> + * dispatch_to_local_dsq_unlock().
> + */
> +static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
> +				       struct rq *src_rq, struct rq *dst_rq)
> +{
> +	rq_unpin_lock(rq, rf);
> +
> +	if (src_rq == dst_rq) {
> +		raw_spin_rq_unlock(rq);
> +		raw_spin_rq_lock(dst_rq);
> +	} else if (rq == src_rq) {
> +		double_lock_balance(rq, dst_rq);
> +		rq_repin_lock(rq, rf);
> +	} else if (rq == dst_rq) {
> +		double_lock_balance(rq, src_rq);
> +		rq_repin_lock(rq, rf);
> +	} else {
> +		raw_spin_rq_unlock(rq);
> +		double_rq_lock(src_rq, dst_rq);
> +	}
> +}
> +
> +/**
> + * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @src_rq: rq to move task from
> + * @dst_rq: rq to move task to
> + *
> + * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
> + */
> +static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
> +					 struct rq *src_rq, struct rq *dst_rq)
> +{
> +	if (src_rq == dst_rq) {
> +		raw_spin_rq_unlock(dst_rq);
> +		raw_spin_rq_lock(rq);
> +		rq_repin_lock(rq, rf);
> +	} else if (rq == src_rq) {
> +		double_unlock_balance(rq, dst_rq);
> +	} else if (rq == dst_rq) {
> +		double_unlock_balance(rq, src_rq);
> +	} else {
> +		double_rq_unlock(src_rq, dst_rq);
> +		raw_spin_rq_lock(rq);
> +		rq_repin_lock(rq, rf);
> +	}
> +}
> +#endif	/* CONFIG_SMP */
> +
> +
> +static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
> +			       struct scx_dispatch_q *dsq)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +	struct task_struct *p;
> +	struct rq *task_rq;
> +	bool moved = false;
> +retry:
> +	if (list_empty(&dsq->fifo))
> +		return false;
> +
> +	raw_spin_lock(&dsq->lock);
> +	list_for_each_entry(p, &dsq->fifo, scx.dsq_node) {
> +		task_rq = task_rq(p);
> +		if (rq == task_rq)
> +			goto this_rq;
> +		if (likely(test_rq_online(rq)) && !is_migration_disabled(p) &&
> +		    cpumask_test_cpu(cpu_of(rq), p->cpus_ptr))
> +			goto remote_rq;
> +	}
> +	raw_spin_unlock(&dsq->lock);
> +	return false;
> +
> +this_rq:
> +	/* @dsq is locked and @p is on this rq */
> +	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> +	list_move_tail(&p->scx.dsq_node, &scx_rq->local_dsq.fifo);
> +	dsq->nr--;
> +	scx_rq->local_dsq.nr++;
> +	p->scx.dsq = &scx_rq->local_dsq;
> +	raw_spin_unlock(&dsq->lock);
> +	return true;
> +
> +remote_rq:
> +#ifdef CONFIG_SMP
> +	/*
> +	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
> +	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
> +	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
> +	 * rq lock or fail, do a little dancing from our side. See
> +	 * move_task_to_local_dsq().
> +	 */
> +	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
> +	list_del_init(&p->scx.dsq_node);
> +	dsq->nr--;
> +	p->scx.holding_cpu = raw_smp_processor_id();
> +	raw_spin_unlock(&dsq->lock);
> +
> +	rq_unpin_lock(rq, rf);
> +	double_lock_balance(rq, task_rq);
> +	rq_repin_lock(rq, rf);
> +
> +	moved = move_task_to_local_dsq(rq, p, 0);
> +
> +	double_unlock_balance(rq, task_rq);
> +#endif /* CONFIG_SMP */
> +	if (likely(moved))
> +		return true;
> +	goto retry;
> +}
> +
> +enum dispatch_to_local_dsq_ret {
> +	DTL_DISPATCHED,		/* successfully dispatched */
> +	DTL_LOST,		/* lost race to dequeue */
> +	DTL_NOT_LOCAL,		/* destination is not a local DSQ */
> +	DTL_INVALID,		/* invalid local dsq_id */
> +};
> +
> +/**
> + * dispatch_to_local_dsq - Dispatch a task to a local dsq
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @dsq_id: destination dsq ID
> + * @p: task to dispatch
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
> + * @dsq_id. This function performs all the synchronization dancing needed
> + * because local DSQs are protected with rq locks.
> + *
> + * The caller must have exclusive ownership of @p (e.g. through
> + * %SCX_OPSS_DISPATCHING).
> + */
> +static enum dispatch_to_local_dsq_ret
> +dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id,
> +		      struct task_struct *p, u64 enq_flags)
> +{
> +	struct rq *src_rq = task_rq(p);
> +	struct rq *dst_rq;
> +
> +	/*
> +	 * We're synchronized against dequeue through DISPATCHING. As @p can't
> +	 * be dequeued, its task_rq and cpus_allowed are stable too.
> +	 */
> +	if (dsq_id == SCX_DSQ_LOCAL) {
> +		dst_rq = rq;
> +	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
> +		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
> +
> +		if (!ops_cpu_valid(cpu)) {
> +			scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]",
> +				      cpu, p->comm, p->pid);
> +			return DTL_INVALID;
> +		}
> +		dst_rq = cpu_rq(cpu);
> +	} else {
> +		return DTL_NOT_LOCAL;
> +	}
> +
> +	/* if dispatching to @rq that @p is already on, no lock dancing needed */
> +	if (rq == src_rq && rq == dst_rq) {
> +		dispatch_enqueue(&dst_rq->scx.local_dsq, p,
> +				 enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		return DTL_DISPATCHED;
> +	}
> +
> +#ifdef CONFIG_SMP
> +	if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
> +		struct rq *locked_dst_rq = dst_rq;
> +		bool dsp;
> +
> +		/*
> +		 * @p is on a possibly remote @src_rq which we need to lock to
> +		 * move the task. If dequeue is in progress, it'd be locking
> +		 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
> +		 * lock while holding DISPATCHING.
> +		 *
> +		 * As DISPATCHING guarantees that @p is wholly ours, we can
> +		 * pretend that we're moving from a DSQ and use the same
> +		 * mechanism - mark the task under transfer with holding_cpu,
> +		 * release DISPATCHING and then follow the same protocol.
> +		 */
> +		p->scx.holding_cpu = raw_smp_processor_id();
> +
> +		/* store_release ensures that dequeue sees the above */
> +		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
> +
> +		dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq);
> +
> +		/*
> +		 * We don't require the BPF scheduler to avoid dispatching to
> +		 * offline CPUs mostly for convenience but also because CPUs can
> +		 * go offline between scx_bpf_dispatch() calls and here. If @p
> +		 * is destined to an offline CPU, queue it on its current CPU
> +		 * instead, which should always be safe. As this is an allowed
> +		 * behavior, don't trigger an ops error.
> +		 */
> +		if (unlikely(!test_rq_online(dst_rq)))
> +			dst_rq = src_rq;
> +
> +		if (src_rq == dst_rq) {
> +			/*
> +			 * As @p is staying on the same rq, there's no need to
> +			 * go through the full deactivate/activate cycle.
> +			 * Optimize by abbreviating the operations in
> +			 * move_task_to_local_dsq().
> +			 */
> +			dsp = p->scx.holding_cpu == raw_smp_processor_id();
> +			if (likely(dsp)) {
> +				p->scx.holding_cpu = -1;
> +				dispatch_enqueue(&dst_rq->scx.local_dsq, p,
> +						 enq_flags);
> +			}
> +		} else {
> +			dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
> +		}
> +
> +		/* if the destination CPU is idle, wake it up */
> +		if (dsp && p->sched_class > dst_rq->curr->sched_class)
> +			resched_curr(dst_rq);
> +
> +		dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq);
> +
> +		return dsp ? DTL_DISPATCHED : DTL_LOST;
> +	}
> +#endif /* CONFIG_SMP */
> +
> +	scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
> +		      cpu_of(dst_rq), p->comm, p->pid);
> +	return DTL_INVALID;
> +}
> +
> +/**
> + * finish_dispatch - Asynchronously finish dispatching a task
> + * @rq: current rq which is locked
> + * @rf: rq_flags to use when unlocking @rq
> + * @p: task to finish dispatching
> + * @qseq_at_dispatch: qseq when @p started getting dispatched
> + * @dsq_id: destination DSQ ID
> + * @enq_flags: %SCX_ENQ_*
> + *
> + * Dispatching to local DSQs may need to wait for queueing to complete or
> + * require rq lock dancing. As we don't wanna do either while inside
> + * ops.dispatch() to avoid locking order inversion, we split dispatching into
> + * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
> + * task and its qseq. Once ops.dispatch() returns, this function is called to
> + * finish up.
> + *
> + * There is no guarantee that @p is still valid for dispatching or even that it
> + * was valid in the first place. Make sure that the task is still owned by the
> + * BPF scheduler and claim the ownership before dispatching.
> + */
> +static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
> +			    struct task_struct *p,
> +			    unsigned long qseq_at_dispatch,
> +			    u64 dsq_id, u64 enq_flags)
> +{
> +	struct scx_dispatch_q *dsq;
> +	unsigned long opss;
> +
> +retry:
> +	/*
> +	 * No need for _acquire here. @p is accessed only after a successful
> +	 * try_cmpxchg to DISPATCHING.
> +	 */
> +	opss = atomic_long_read(&p->scx.ops_state);
> +
> +	switch (opss & SCX_OPSS_STATE_MASK) {
> +	case SCX_OPSS_DISPATCHING:
> +	case SCX_OPSS_NONE:
> +		/* someone else already got to it */
> +		return;
> +	case SCX_OPSS_QUEUED:
> +		/*
> +		 * If qseq doesn't match, @p has gone through at least one
> +		 * dispatch/dequeue and re-enqueue cycle between
> +		 * scx_bpf_dispatch() and here and we have no claim on it.
> +		 */
> +		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
> +			return;
> +
> +		/*
> +		 * While we know @p is accessible, we don't yet have a claim on
> +		 * it - the BPF scheduler is allowed to dispatch tasks
> +		 * spuriously and there can be a racing dequeue attempt. Let's
> +		 * claim @p by atomically transitioning it from QUEUED to
> +		 * DISPATCHING.
> +		 */
> +		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
> +						   SCX_OPSS_DISPATCHING)))
> +			break;
> +		goto retry;
> +	case SCX_OPSS_QUEUEING:
> +		/*
> +		 * do_enqueue_task() is in the process of transferring the task
> +		 * to the BPF scheduler while holding @p's rq lock. As we aren't
> +		 * holding any kernel or BPF resource that the enqueue path may
> +		 * depend upon, it's safe to wait.
> +		 */
> +		wait_ops_state(p, opss);
> +		goto retry;
> +	}
> +
> +	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
> +
> +	switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) {
> +	case DTL_DISPATCHED:
> +		break;
> +	case DTL_LOST:
> +		break;
> +	case DTL_INVALID:
> +		dsq_id = SCX_DSQ_GLOBAL;
> +		fallthrough;
> +	case DTL_NOT_LOCAL:
> +		dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
> +					    dsq_id, p);
> +		dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
> +		break;
> +	}
> +}
> +
> +static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
> +{
> +	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
> +	u32 u;
> +
> +	for (u = 0; u < dspc->buf_cursor; u++) {
> +		struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u];
> +
> +		finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id,
> +				ent->enq_flags);
> +	}
> +
> +	dspc->nr_tasks += dspc->buf_cursor;
> +	dspc->buf_cursor = 0;
> +}
> +
> +static int balance_scx(struct rq *rq, struct task_struct *prev,
> +		       struct rq_flags *rf)
> +{
> +	struct scx_rq *scx_rq = &rq->scx;
> +	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
> +	bool prev_on_scx = prev->sched_class == &ext_sched_class;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	if (prev_on_scx) {
> +		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
> +		update_curr_scx(rq);
> +
> +		/*
> +		 * If @prev is runnable & has slice left, it has priority and
> +		 * fetching more just increases latency for the fetched tasks.
> +		 * Tell put_prev_task_scx() to put @prev on local_dsq.
> +		 *
> +		 * See scx_ops_disable_workfn() for the explanation on the
> +		 * disabling() test.
> +		 */
> +		if ((prev->scx.flags & SCX_TASK_QUEUED) &&
> +		    prev->scx.slice && !scx_ops_disabling()) {
> +			prev->scx.flags |= SCX_TASK_BAL_KEEP;
> +			return 1;
> +		}
> +	}
> +
> +	/* if there already are tasks to run, nothing to do */
> +	if (scx_rq->local_dsq.nr)
> +		return 1;
> +
> +	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
> +		return 1;
> +
> +	if (!SCX_HAS_OP(dispatch))
> +		return 0;
> +
> +	dspc->rq = rq;
> +	dspc->rf = rf;
> +
> +	/*
> +	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
> +	 * the local DSQ might still end up empty after a successful
> +	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
> +	 * produced some tasks, retry. The BPF scheduler may depend on this
> +	 * looping behavior to simplify its implementation.
> +	 */
> +	do {
> +		dspc->nr_tasks = 0;
> +
> +		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
> +			    prev_on_scx ? prev : NULL);
> +
> +		flush_dispatch_buf(rq, rf);
> +
> +		if (scx_rq->local_dsq.nr)
> +			return 1;
> +		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
> +			return 1;
> +	} while (dspc->nr_tasks);
> +
> +	return 0;
> +}
> +
> +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
> +{
> +	if (p->scx.flags & SCX_TASK_QUEUED) {
> +		WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE);

Ditto. Even if this line is replaced later by
"[PATCH 31/36] sched_ext: Implement core-sched support"

> +		dispatch_dequeue(&rq->scx, p);
> +	}
> +
> +	p->se.exec_start = rq_clock_task(rq);
> +}

Thanks,
-Andrea