On Thu, Jun 27, 2024 at 5:24 PM Tejun Heo <tj@xxxxxxxxxx> wrote:
>
> Implement scx_bpf_consume_task() which allows consuming arbitrary tasks on
> the DSQ in any order while iterating in the dispatch path.
>
> scx_qmap is updated to implement periodic dumping of the shared DSQ and a
> rather silly prioritization mechanism to demonstrate the use of DSQ
> iteration and selective consumption.
>
> Note that it does a bit of nasty dance to pass in the pointer to the
> iterator to __scx_bpf_consume_task(). This is to work around the current
> limitation in the BPF verifier where it doesn't allow the memory area used
> for an iterator to be passed into kfuncs. This may be too nasty and might
> require a different approach.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Reviewed-by: David Vernet <dvernet@xxxxxxxx>
> Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
> Cc: bpf@xxxxxxxxxxxxxxx
> ---
> Hello, again.
>
> (continuing from the previous patch) so, the problem is that I need to
> distinguish the tasks which have left a queue and then get requeued while an
> iteration is in progress. The iterator itself already does this - it
> remembers a sequence number when iteration starts and ignores tasks which
> are queued afterwards.
>
> As a task can get removed and requeued anytime, I need
> scx_bpf_consume_task() to do the same testing, so I want to pass in the
> iterator pointer into scx_bpf_consume_task() so that it can read the
> sequence number stored in the iterator. However, BPF doesn't allow this, so
> I'm doing the weird self pointer probe read thing to obtain it, which is
> quite nasty.
>
> What do you think?
>
> Thanks.
>
>  kernel/sched/ext.c                       | 89 +++++++++++++++++++++++++++++--
>  tools/sched_ext/include/scx/common.bpf.h | 16 +++++
>  tools/sched_ext/scx_qmap.bpf.c           | 34 ++++++++++-
>  tools/sched_ext/scx_qmap.c               | 14 +++-
>  4 files changed, 142 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -1122,6 +1122,12 @@ enum scx_dsq_iter_flags {
>  };
>
>  struct bpf_iter_scx_dsq_kern {
> +        /*
> +         * Must be the first field. Used to work around BPF restriction and pass
> +         * in the iterator pointer to scx_bpf_consume_task().
> +         */
> +        struct bpf_iter_scx_dsq_kern    *self;
> +
>          struct scx_dsq_node             cursor;
>          struct scx_dispatch_q           *dsq;
>          u32                             dsq_seq;
> @@ -1518,7 +1524,7 @@ static void dispatch_enqueue(struct scx_
>          p->scx.dsq_seq = dsq->seq;
>
>          dsq_mod_nr(dsq, 1);
> -        p->scx.dsq = dsq;
> +        WRITE_ONCE(p->scx.dsq, dsq);
>
>          /*
>           * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
> @@ -1611,7 +1617,7 @@ static void dispatch_dequeue(struct rq *
>                  WARN_ON_ONCE(task_linked_on_dsq(p));
>                  p->scx.holding_cpu = -1;
>          }
> -        p->scx.dsq = NULL;
> +        WRITE_ONCE(p->scx.dsq, NULL);
>
>          if (!is_local)
>                  raw_spin_unlock(&dsq->lock);
> @@ -2107,7 +2113,7 @@ static void consume_local_task(struct rq
>          list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list);
>          dsq_mod_nr(dsq, -1);
>          dsq_mod_nr(&rq->scx.local_dsq, 1);
> -        p->scx.dsq = &rq->scx.local_dsq;
> +        WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq);
>          raw_spin_unlock(&dsq->lock);
>  }
>
> @@ -5585,12 +5591,88 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq
>          }
>  }
>
> +/**
> + * __scx_bpf_consume_task - Transfer a task from DSQ iteration to the local DSQ
> + * @it: DSQ iterator in progress
> + * @p: task to consume
> + *
> + * Transfer @p which is on the DSQ currently iterated by @it to the current
> + * CPU's local DSQ. For the transfer to be successful, @p must still be on the
> + * DSQ and have been queued before the DSQ iteration started. This function
> + * doesn't care whether @p was obtained from the DSQ iteration. @p just has to
> + * be on the DSQ and have been queued before the iteration started.
> + *
> + * Returns %true if @p has been consumed, %false if @p had already been consumed
> + * or dequeued.
> + */
> +__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
> +{
> +        struct bpf_iter_scx_dsq_kern *kit = (void *)it;
> +        struct scx_dispatch_q *dsq, *kit_dsq;
> +        struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
> +        struct rq *task_rq;
> +        u64 kit_dsq_seq;
> +
> +        /* can't trust @kit, carefully fetch the values we need */
> +        if (get_kernel_nofault(kit_dsq, &kit->dsq) ||
> +            get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) {
> +                scx_ops_error("invalid @it 0x%lx", it);
> +                return false;
> +        }

With scx_bpf_consume_task() it's only compile-time protection from bugs.
Since the kfunc doesn't dereference any field in kit_dsq, it won't crash
immediately, but let's figure out how to make it work properly.

Since kit_dsq and kit_dsq_seq can be pretty much anything in this
implementation, can they be passed in as two scalars instead? I guess not,
since tricking the dsq != kit_dsq and time_after64(.., kit_dsq_seq) checks
can lead to real issues?

Can some of it be mitigated by passing the dsq that was used to init the
iter into the kfunc? Then the kfunc would read dsq->seq from it instead of
kit->dsq_seq?

> +
> +        /*
> +         * @kit can't be trusted and we can only get the DSQ from @p. As we don't
> +         * know whether @p's rq is locked, use READ_ONCE() to access the field.
> +         * Derefing is safe as DSQs are RCU protected.
> +         */
> +        dsq = READ_ONCE(p->scx.dsq);
> +
> +        if (unlikely(!dsq || dsq != kit_dsq))
> +                return false;
> +
> +        if (unlikely(dsq->id == SCX_DSQ_LOCAL)) {
> +                scx_ops_error("local DSQ not allowed");
> +                return false;
> +        }
> +
> +        if (!scx_kf_allowed(SCX_KF_DISPATCH))
> +                return false;
> +
> +        flush_dispatch_buf(dspc->rq, dspc->rf);
> +
> +        raw_spin_lock(&dsq->lock);
> +
> +        /*
> +         * Did someone else get to it? @p could have already left $dsq, got
> +         * re-enqueued, or be in the process of being consumed by someone else.
> +         */
> +        if (unlikely(p->scx.dsq != dsq ||
> +                     time_after64(p->scx.dsq_seq, kit_dsq_seq) ||

In the previous patch you do:

  (s32)(p->scx.dsq_seq - kit->dsq_seq) > 0

and here time_after64(). Close enough, but what about the 32 vs 64 bit and
equality differences? (a small sketch of the two idioms is at the end of
this mail)

> +                     p->scx.holding_cpu >= 0))
> +                goto out_unlock;
> +
> +        task_rq = task_rq(p);
> +
> +        if (dspc->rq == task_rq) {
> +                consume_local_task(dspc->rq, dsq, p);
> +                return true;
> +        }
> +
> +        if (task_can_run_on_remote_rq(p, dspc->rq))
> +                return consume_remote_task(dspc->rq, dspc->rf, dsq, p, task_rq);
> +
> +out_unlock:
> +        raw_spin_unlock(&dsq->lock);
> +        return false;
> +}
> +
>  __bpf_kfunc_end_defs();
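
To make the 32 vs 64 bit question above concrete, here is a tiny standalone
sketch (not from the patch; the seq_after*() helpers and the typedefs are
made up so it builds in userspace) of the two comparison idioms:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef int32_t  s32;
typedef uint64_t u64;
typedef int64_t  s64;

/*
 * Previous patch's form: strict "a was assigned after b", which stays
 * correct across u32 wraparound as long as the two values are less than
 * 2^31 apart.
 */
static int seq_after32(u32 a, u32 b)
{
        return (s32)(a - b) > 0;
}

/*
 * This patch's form: the comparison inside time_after64(a, b) is
 * (s64)(b - a) < 0, i.e. the same strict test with 2^63 of headroom.
 */
static int seq_after64(u64 a, u64 b)
{
        return (s64)(b - a) < 0;
}

int main(void)
{
        /* equal seq numbers are "not after" in both forms */
        printf("%d %d\n", seq_after32(5, 5), seq_after64(5, 5));    /* 0 0 */
        /* both tolerate wraparound of their respective widths */
        printf("%d\n", seq_after32(2, 0xfffffff0u));                /* 1 */
        printf("%d\n", seq_after64(2, ~0ull - 15));                 /* 1 */
        return 0;
}

If I'm reading both checks right, they treat equal sequence numbers the same
way and only differ in how far the seq counter can advance during one
iteration before the comparison wraps.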