Re: [PATCH v3 bpf-next 2/3] bpf: Introduce task_vma open-coded iterator kfuncs

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Tue, 22 Aug 2023 16:52:22 -0700

On Mon, Aug 21, 2023 at 10:06 PM Dave Marchevsky <davemarchevsky@xxxxxx> wrote:
>
> This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
> creation and manipulation of struct bpf_iter_task_vma in open-coded
> iterator style. BPF programs can use these kfuncs directly or through
> bpf_for_each macro for natural-looking iteration of all task vmas.
>
> The implementation borrows heavily from bpf_find_vma helper's locking -
> differing only in that it holds the mmap_read lock for all iterations
> while the helper only executes its provided callback on a maximum of 1
> vma. Aside from locking, struct vma_iterator and vma_next do all the
> heavy lifting.
>
> The newly-added struct bpf_iter_task_vma has a name collision with a
> selftest for the seq_file task_vma iter's bpf skel, so the selftests/bpf/progs
> file is renamed in order to avoid the collision.
>
> A pointer to an inner data struct, struct bpf_iter_task_vma_kern_data, is the
> only field in struct bpf_iter_task_vma. This is because the inner data
> struct contains a struct vma_iterator (not ptr), whose size is likely to
> change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
> such a change would require change in opaque bpf_iter_task_vma struct's
> size. So better to allocate vma_iterator using BPF allocator, and since
> that alloc must already succeed, might as well allocate all iter fields,
> thereby freezing struct bpf_iter_task_vma size.
>
> Signed-off-by: Dave Marchevsky <davemarchevsky@xxxxxx>
> Cc: Nathan Slingerland <slinger@xxxxxxxx>
> ---
>  include/uapi/linux/bpf.h                      |  4 +
>  kernel/bpf/helpers.c                          |  3 +
>  kernel/bpf/task_iter.c                        | 84 +++++++++++++++++++
>  tools/include/uapi/linux/bpf.h                |  4 +
>  tools/lib/bpf/bpf_helpers.h                   |  8 ++
>  .../selftests/bpf/prog_tests/bpf_iter.c       | 26 +++---
>  ...f_iter_task_vma.c => bpf_iter_task_vmas.c} |  0
>  7 files changed, 116 insertions(+), 13 deletions(-)
>  rename tools/testing/selftests/bpf/progs/{bpf_iter_task_vma.c => bpf_iter_task_vmas.c} (100%)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8790b3962e4b..49fc1989a548 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_task_vma {
> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..7a06dea749f1 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -2482,6 +2482,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
>  BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
>  BTF_ID_FLAGS(func, bpf_dynptr_adjust)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_null)
>  BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index c4ab9d6cdbe9..51c2dce435c1 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -7,7 +7,9 @@
>  #include <linux/fs.h>
>  #include <linux/fdtable.h>
>  #include <linux/filter.h>
> +#include <linux/bpf_mem_alloc.h>
>  #include <linux/btf_ids.h>
> +#include <linux/mm_types.h>
>  #include "mmap_unlock_work.h"
>
>  static const char * const iter_task_type_names[] = {
> @@ -823,6 +825,88 @@ const struct bpf_func_proto bpf_find_vma_proto = {
>         .arg5_type      = ARG_ANYTHING,
>  };
>
> +struct bpf_iter_task_vma_kern_data {
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +       struct mmap_unlock_irq_work *work;
> +       struct vma_iterator vmi;
> +};
> +
> +/* Non-opaque version of uapi bpf_iter_task_vma */
> +struct bpf_iter_task_vma_kern {
> +       struct bpf_iter_task_vma_kern_data *data;
> +} __attribute__((aligned(8)));
> +

it's a bit worrying that we'll rely on memory allocation inside NMI to
be able to use this. I'm missing previous email discussion (I declared
email bankruptcy after long vacation), but I suppose the option to fix
bpf_iter_task_vma (to 88 bytes: 64 for vma_iterator + 24 for extra
pointers), or even to 96 to have a bit of headroom in case we need a
bit more space was rejected? It seems unlikely that vma_iterator will
have to grow, but if it does, it has 5 bytes of padding right now for
various flags, plus we can have extra 8 bytes reserved just in case.

I know it's a big struct and will take a big chunk of the BPF stack,
but I'm a bit worried about both the performance implication of mem
alloc under NMI, and allocation failing.

Maybe the worry is overblown, but I thought I'll bring it up anyways.

> +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +                                     struct task_struct *task, u64 addr)
> +{
> +       struct bpf_iter_task_vma_kern *kit = (void *)it;
> +       bool irq_work_busy = false;
> +       int err;
> +

[...]

>  static void do_mmap_read_unlock(struct irq_work *entry)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 8790b3962e4b..49fc1989a548 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -7311,4 +7311,8 @@ struct bpf_iter_num {
>         __u64 __opaque[1];
>  } __attribute__((aligned(8)));
>
> +struct bpf_iter_task_vma {
> +       __u64 __opaque[1]; /* See bpf_iter_num comment above */
> +} __attribute__((aligned(8)));
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
> index bbab9ad9dc5a..d885ffee4d88 100644
> --- a/tools/lib/bpf/bpf_helpers.h
> +++ b/tools/lib/bpf/bpf_helpers.h
> @@ -302,6 +302,14 @@ extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak
>  extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym;
>  extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym;
>
> +struct bpf_iter_task_vma;
> +
> +extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> +                                struct task_struct *task,
> +                                unsigned long addr) __weak __ksym;
> +extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __weak __ksym;
> +extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __weak __ksym;

my intent wasn't to add all open-coded iterators to bpf_helpers.h. I
think bpf_iter_num_* is rather an exception and isn't supposed to ever
change or be removed, while other iterators should be allowed to be
changed.

The goal is for all such kfuncs (and struct bpf_iter_task_vma state
itself, probably) to come from vmlinux.h, eventually, so let's leave
it out of libbpf's stable bpf_helpers.h header.

[...]

> @@ -1533,7 +1533,7 @@ static void test_task_vma_dead_task(void)
>  out:
>         waitpid(child_pid, &wstatus, 0);
>         close(iter_fd);
> -       bpf_iter_task_vma__destroy(skel);
> +       bpf_iter_task_vmas__destroy(skel);
>  }
>
>  void test_bpf_sockmap_map_iter_fd(void)
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> similarity index 100%
> rename from tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
> rename to tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c
> --
> 2.34.1
>