Re: [PATCH v2 bpf-next 4/8] bpf: implement number iterator

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Tue, 7 Mar 2023 14:14:29 -0800

On Tue, Mar 7, 2023 at 1:53 PM Andrii Nakryiko <andrii@xxxxxxxxxx> wrote:
>
> Implement first open-coded iterator type over a range of integers.
>
> It's public API consists of:
>   - bpf_iter_num_new() constructor, which accepts [start, end) range
>     (that is, start is inclusive, end is exclusive).
>   - bpf_iter_num_next() which will keep returning read-only pointer to int
>     until the range is exhausted, at which point NULL will be returned.
>     If bpf_iter_num_next() is kept calling after this, NULL will be
>     persistently returned.
>   - bpf_iter_num_destroy() destructor, which needs to be called at some
>     point to clean up iterator state. BPF verifier enforces that iterator
>     destructor is called at some point before BPF program exits.
>
> Note that `start = end = X` is a valid combination to setup empty
> iterator. bpf_iter_num_new() will return 0 (success) for any such
> combination.
>
> If bpf_iter_num_new() detects invalid combination of input arguments, it
> returns error, resets iterator state to, effectively, empty iterator, so
> any subsequent call to bpf_iter_num_next() will keep returning NULL.
>
> BPF verifier has no knowledge that returned integers are in the
> [start, end) value range, as both `start` and `end` are not statically
> known/enforced, they are runtime values only.
>
> While implementation is pretty trivial, some care needs to be taken to
> avoid overflows and underflows. Subsequent selftests will validate
> correctness of [start, end) semantics, especially around extremes
> (INT_MIN and INT_MAX).
>
> Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
> be specified.
>
> bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
> BPF loops and bpf_loop() helper and is the basis for implementing
> ergonomic BPF loops with no statically known or verified bounds.
> Subsequent patches implement bpf_for() macro, demonstrating how this can
> be wrapped into something that works and feels like a normal for() loop
> in C language.
>
> Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> ---
>  include/linux/bpf.h            |  8 +++-
>  include/uapi/linux/bpf.h       |  8 ++++
>  kernel/bpf/bpf_iter.c          | 70 ++++++++++++++++++++++++++++++++++
>  kernel/bpf/helpers.c           |  3 ++
>  kernel/bpf/verifier.c          |  6 +++
>  tools/include/uapi/linux/bpf.h |  8 ++++
>  6 files changed, 101 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 6792a7940e1e..e64ff1e89fb2 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1617,8 +1617,12 @@ struct bpf_array {
>  #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
>  #define MAX_TAIL_CALL_CNT 33
>
> -/* Maximum number of loops for bpf_loop */
> -#define BPF_MAX_LOOPS  BIT(23)
> +/* Maximum number of loops for bpf_loop and bpf_iter_num.
> + * It's enum to expose it (and thus make it discoverable) through BTF.
> + */
> +enum {
> +       BPF_MAX_LOOPS = 8 * 1024 * 1024,
> +};
>
>  #define BPF_F_ACCESS_MASK      (BPF_F_RDONLY |         \
>                                  BPF_F_RDONLY_PROG |    \
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 976b194eb775..bf8b77d9a17e 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7112,4 +7112,12 @@ enum {
>         BPF_F_TIMER_ABS = (1ULL << 0),
>  };
>
> +/* BPF numbers iterator state */
> +struct bpf_iter_num {
> +       /* opaque iterator state; having __u64 here allows to preserve correct
> +        * alignment requirements in vmlinux.h, generated from BTF
> +        */
> +       __u64 __opaque[1];
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 5dc307bdeaeb..96856f130cbf 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
>         .arg3_type      = ARG_PTR_TO_STACK_OR_NULL,
>         .arg4_type      = ARG_ANYTHING,
>  };
> +
> +struct bpf_iter_num_kern {
> +       int cur; /* current value, inclusive */
> +       int end; /* final value, exclusive */
> +} __aligned(8);
> +
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +                 "Global functions as their definitions will be in vmlinux BTF");
> +
> +__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
> +{
> +       struct bpf_iter_num_kern *s = (void *)it;
> +
> +       BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
> +       BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
> +
> +       BTF_TYPE_EMIT(struct btf_iter_num);
> +
> +       /* start == end is legit, it's an empty range and we'll just get NULL
> +        * on first (and any subsequent) bpf_iter_num_next() call
> +        */
> +       if (start > end) {
> +               s->cur = s->end = 0;
> +               return -EINVAL;
> +       }
> +
> +       /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
> +       if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
> +               s->cur = s->end = 0;
> +               return -E2BIG;
> +       }
> +
> +       /* user will call bpf_iter_num_next() first,
> +        * which will set s->cur to exactly start value;
> +        * underflow shouldn't matter
> +        */
> +       s->cur = start - 1;
> +       s->end = end;
> +
> +       return 0;
> +}
> +
> +__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
> +{
> +       struct bpf_iter_num_kern *s = (void *)it;
> +
> +       /* check failed initialization or if we are done (same behavior);
> +        * need to be careful about overflow, so convert to s64 for checks,
> +        * e.g., if s->cur == s->end == INT_MAX, we can't just do
> +        * s->cur + 1 >= s->end
> +        */
> +       if ((s64)(s->cur + 1) >= s->end) {
> +               s->cur = s->end = 0;
> +               return NULL;
> +       }
> +
> +       s->cur++;
> +
> +       return &s->cur;
> +}
> +
> +__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
> +{
> +       struct bpf_iter_num_kern *s = (void *)it;
> +
> +       s->cur = s->end = 0;
> +}
> +
> +__diag_pop();
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 637ac4e92e75..f9b7eeedce08 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
>  BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
>  BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
>  BTF_SET8_END(common_btf_ids)
>
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d1246d187fac..33c305b563f6 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9542,6 +9542,9 @@ enum special_kfunc_type {
>         KF_bpf_dynptr_from_xdp,
>         KF_bpf_dynptr_slice,
>         KF_bpf_dynptr_slice_rdwr,
> +       KF_bpf_iter_num_new,
> +       KF_bpf_iter_num_next,
> +       KF_bpf_iter_num_destroy,

don't really need this, this is a leftover from v1, missed it. If this
patch set is going to be applied as is, please strip these three rows.

>  };
>
>  BTF_SET_START(special_kfunc_set)
> @@ -9580,6 +9583,9 @@ BTF_ID(func, bpf_dynptr_from_skb)
>  BTF_ID(func, bpf_dynptr_from_xdp)
>  BTF_ID(func, bpf_dynptr_slice)
>  BTF_ID(func, bpf_dynptr_slice_rdwr)
> +BTF_ID(func, bpf_iter_num_new)
> +BTF_ID(func, bpf_iter_num_next)
> +BTF_ID(func, bpf_iter_num_destroy)
>
>  static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
>  {
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 976b194eb775..bf8b77d9a17e 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7112,4 +7112,12 @@ enum {
>         BPF_F_TIMER_ABS = (1ULL << 0),
>  };
>
> +/* BPF numbers iterator state */
> +struct bpf_iter_num {
> +       /* opaque iterator state; having __u64 here allows to preserve correct
> +        * alignment requirements in vmlinux.h, generated from BTF
> +        */
> +       __u64 __opaque[1];
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> --
> 2.34.1
>