Re: struct_ops usage - verifier interactions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Mar 5, 2024 at 8:08 PM Isaac Khor <contact@xxxxxxxxxxxxx> wrote:
>
> I’m a phd student working on adding a new struct_ops type for a potential project, and I have a few questions about its usage.
>
> The background is that we’re thinking of delegating some kernel heuristics to bpf programs, and we’re starting with the readahead logic. The prototype right now just adds an ugly struct_ops right in the middle of the block read path (specifically in read_pages just as a test, will probably split out the read vs fault path later) as a test. The patch I have simply packages up a few readahead control variables already present into a new `bpf_fs_readahead_state` struct and passes that to the struct_ops bpf program.
>
> The problem, however, is that the verifier rejects a program that is literally nothing but `return state->ra_pages`. Any dereference of the passed-in struct lead to a verifier rejection with `R1 invalid mem access ‘scalar’`, complaining that the passed-in pointer is invalid. The register, when passed to `is_valid_access`, is already of scalar type, so even a `return true` `is_valid_access` still leads to the rejection.
>
> I’m confused about why the pointer is not a `PTR_TO_BTF_ID`, which is the case when looking at calls to bpf_cong’s `is_valid_access` when loading bpf_cubic.c. I’m reviewing the fuse-bpf and sched_ext patches, and I don’t see where I need to register my new state struct with the verifier, if that’s indeed what I need to do for the verifier to correctly identify the pointer in `check_mem_access`. I see references to special pointer types for tcp_sock in the case of tcp_cong, but not anything for the fuse-bpf patches.
>
> If there’s documentation somewhere or sample code I should consult please let me know as well.
>
> Here’s my (very ugly, just trying to get it to work) patch is below, along with the program I’m trying to load and a verification log.
>
> ---
>  include/linux/bpf.h |  21 +++++++++
>  mm/Makefile         |   3 ++
>  mm/readahead.c      |  42 +++++++++++++++++
>  mm/readahead_ops.c  | 112 ++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 178 insertions(+)
>  create mode 100644 mm/readahead_ops.c
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c7aa99b44d..0e461d532c 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1799,6 +1799,27 @@ struct bpf_dummy_ops {
>         int (*test_sleepable)(struct bpf_dummy_ops_state *cb);
>  };
>
> +// KNN: put it here for now for lack of a better place
> +
> +struct bpf_fs_readahead_state {
> +       int i_ino;
> +       unsigned int size;
> +       unsigned int async_size;
> +       unsigned int ra_pages;
> +       unsigned int mmap_miss;
> +       loff_t prev_pos;
> +};
> +
> +struct bpf_fs_readahead_ops {
> +       int (*get_max_ra)(struct bpf_fs_readahead_state *cb);
> +       int (*get_ra)(struct bpf_fs_readahead_state *cb);
> +};
> +
> +// temporarily globals as we get this working
> +
> +int bpf_fs_ra_register(struct bpf_fs_readahead_ops *ops);
> +void bpf_fs_ra_unregister(struct bpf_fs_readahead_ops *ops);
> +
>  int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
>                             union bpf_attr __user *uattr);
>  #endif
> diff --git a/mm/Makefile b/mm/Makefile
> index e4b5b75aae..ccd4287286 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -134,3 +134,6 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
>  obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> +
> +# TODO use bpf for now
> +obj-$(CONFIG_BPF) += readahead_ops.o
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 23620c57c1..a2c88fb629 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -113,6 +113,7 @@
>   * ->read_folio() which may be less efficient.
>   */
>
> +#include "linux/bpf.h"
>  #include <linux/blkdev.h>
>  #include <linux/kernel.h>
>  #include <linux/dax.h>
> @@ -143,6 +144,28 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
>  }
>  EXPORT_SYMBOL_GPL(file_ra_state_init);
>
> +static struct bpf_fs_readahead_ops *bpf_fs_ra_ops;
> +
> +int bpf_fs_ra_register(struct bpf_fs_readahead_ops *ops)
> +{
> +       if (bpf_fs_ra_ops)
> +               return -EBUSY;
> +
> +       bpf_fs_ra_ops = ops;
> +       return 0;
> +}
> +
> +void bpf_fs_ra_unregister(struct bpf_fs_readahead_ops *ops)
> +{
> +       if (bpf_fs_ra_ops == ops)
> +               bpf_fs_ra_ops = NULL;
> +}
> +
> +/*
> + * There are two ways to get here: from the filemap_read path, or from the
> + * page fault handler for memory-mapped files (filemap_fault). The first will
> + * set up readahead with ondemand_readahead, the second will do ???
> +*/
>  static void read_pages(struct readahead_control *rac)
>  {
>         const struct address_space_operations *aops = rac->mapping->a_ops;
> @@ -156,6 +179,25 @@ static void read_pages(struct readahead_control *rac)
>                 psi_memstall_enter(&rac->_pflags);
>         blk_start_plug(&plug);
>
> +       // there are two possible places to put this:
> +       // 1. In both the filemap_read and filemap_fault paths, after the call
> +       // determine readahead
> +       // 2. Here, where they both converge after determining readahead.
> +       // Not sure which is better, so I'm leaving it here for now
> +       if (bpf_fs_ra_ops) {
> +               struct bpf_fs_readahead_state state = {
> +                       .i_ino = rac->mapping->host->i_ino,
> +                       .size = rac->ra->size,
> +                       .async_size = rac->ra->async_size,
> +                       .ra_pages = rac->ra->ra_pages,
> +                       .mmap_miss = rac->ra->mmap_miss,
> +                       .prev_pos = rac->ra->prev_pos,
> +               };
> +
> +               rac->ra->ra_pages = bpf_fs_ra_ops->get_max_ra(&state);
> +               // rac->ra->size = bpf_fs_ra_ops->get_ra(&state);
> +       }
> +
>         if (aops->readahead) {
>                 aops->readahead(rac);
>                 /*
> diff --git a/mm/readahead_ops.c b/mm/readahead_ops.c
> new file mode 100644
> index 0000000000..be5de16b88
> --- /dev/null
> +++ b/mm/readahead_ops.c
> @@ -0,0 +1,112 @@
> +#include <linux/init.h>
> +#include <linux/types.h>
> +#include <linux/bpf_verifier.h>
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/filter.h>
> +
> +static int fs_readahead_get_max_ra_stub(struct bpf_fs_readahead_state *state)
> +{
> +       return 0;
> +}
> +
> +static int fs_readahead_get_ra_stub(struct bpf_fs_readahead_state *state)
> +{
> +       return 0;
> +}
> +
> +struct bpf_fs_readahead_ops knn_readahead = {
> +       .get_max_ra = fs_readahead_get_max_ra_stub,
> +       .get_ra = fs_readahead_get_ra_stub,
> +};
> +
> +// verifier
> +
> +static bool fs_readahead_ops_is_valid_access(int off, int size,
> +                                            enum bpf_access_type type,
> +                                            const struct bpf_prog *prog,
> +                                            struct bpf_insn_access_aux *info)
> +{
> +       return true;
> +}
> +
> +static int fs_readahead_btf_struct_access(struct bpf_verifier_log *log,
> +                                         const struct bpf_reg_state *reg,
> +                                         int off, int size)
> +{
> +       return 0;
> +}
> +
> +struct bpf_verifier_ops bpf_fs_readahead_verifier_ops = {
> +       .get_func_proto = NULL,
> +       .is_valid_access = fs_readahead_ops_is_valid_access,
> +       .btf_struct_access = fs_readahead_btf_struct_access,
> +};
> +
> +// management
> +
> +static int register_bpf_fs_ra(void *kdata)
> +{
> +       printk(KERN_ALERT "register_bpf_fs_ra\n");
> +       return bpf_fs_ra_register(kdata);
> +}
> +
> +static void unregister_bpf_fs_ra(void *kdata)
> +{
> +       printk(KERN_ALERT "unregister_bpf_fs_ra\n");
> +       bpf_fs_ra_unregister(kdata);
> +}
> +
> +static int bpf_fs_ra_check_member(const struct btf_type *t,
> +                                  const struct btf_member *member,
> +                                  const struct bpf_prog *prog)
> +{
> +       return 0;
> +}
> +
> +static int bpf_fs_ra_init_member(const struct btf_type *t,
> +                                const struct btf_member *member,
> +                                void *kdata, const void *udata)
> +{
> +       // no-op for now
> +       return 0;
> +}
> +
> +static struct btf *bpf_fs_ra_btf;
> +
> +static int bpf_fs_ra_init(struct btf *btf)
> +{
> +       bpf_fs_ra_btf = btf;
> +       return 0;
> +}
> +
> +static struct bpf_struct_ops bpf_fs_readahead_ops_struct = {
> +       .verifier_ops = &bpf_fs_readahead_verifier_ops,
> +       .reg = register_bpf_fs_ra,
> +       .unreg = unregister_bpf_fs_ra,
> +       .update = NULL, // cannot be updated
> +       .check_member = bpf_fs_ra_check_member,
> +       .init_member = bpf_fs_ra_init_member,
> +       .init = bpf_fs_ra_init,
> +       .validate = NULL,
> +
> +       .name = "bpf_fs_readahead_ops",
> +       .cfi_stubs = &knn_readahead,
> +       .owner = THIS_MODULE,
> +};
> +
> +static void noop(void *p)
> +{
> +       // no-op
> +}
> +
> +static int __init bpf_fs_ra_kfunc_init(void)
> +{
> +       int ret = 0;
> +       noop(&bpf_fs_readahead_ops_struct);
> +       ret = register_bpf_struct_ops(&bpf_fs_readahead_ops_struct,
> +                                         bpf_fs_readahead_ops);
> +       return ret;
> +}
> +late_initcall(bpf_fs_ra_kfunc_init);
> --
> 2.34.1
>
> And also the bpf program I’m trying to load:
>
> #include <bpf/bpf_helpers.h>
> #include <bpf/bpf_tracing.h>
> #include <linux/bpf.h>
>
> char _license[] SEC("license") = "GPL";
>
> SEC("struct_ops/bpf_iden_max_ra")
> int BPF_PROG(bpf_iden_max_ra, struct bpf_fs_readahead_state *s)
> {
>     return s->ra_pages;
> }
>
> SEC("struct_ops/bpf_iden_ra")
> int BPF_PROG(bpf_iden_ra, struct bpf_fs_readahead_state *s)
> {
>     return s->size;
> }
>
> SEC(".struct_ops")
> struct bpf_fs_readahead_ops bpf_fs_readahead_ops = {
>     .get_max_ra = (void *)bpf_iden_max_ra,
>     .get_ra = (void *)bpf_iden_ra,
> };
>
> And the verification log:
>
> + bpftool struct_ops register /root/readahead_noop.o
> libbpf: prog 'bpf_iden_max_ra': BPF program load failed: Permission denied
> libbpf: prog 'bpf_iden_max_ra': -- BEGIN PROG LOAD LOG --
> 0: R1=ctx() R10=fp0
> ; int BPF_PROG(bpf_iden_max_ra, struct bpf_fs_readahead_state *s) @ readahead_noop.c:8
> 0: (7b) *(u64 *)(r10 -8) = r1         ; R1=ctx() R10=fp0 fp-8_w=ctx()
> 1: (79) r2 = *(u64 *)(r1 +0)          ; R1=ctx() R2_w=scalar()
> 2: (85) call pc+1
> caller:
>  R10=fp0 fp-8_w=ctx()
> callee:
>  frame1: R1=ctx() R2_w=scalar() R10=fp0
> 4: frame1: R1=ctx() R2_w=scalar() R10=fp0
> 4: (7b) *(u64 *)(r10 -8) = r1         ; frame1: R1=ctx() R10=fp0 fp-8_w=ctx()
> 5: (7b) *(u64 *)(r10 -16) = r2        ; frame1: R2_w=scalar(id=1) R10=fp0 fp-16_w=scalar(id=1)
> ; return s->ra_pages; @ readahead_noop.c:10
> 6: (61) r0 = *(u32 *)(r2 +12)
> R2 invalid mem access 'scalar'
> processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
> -- END PROG LOAD LOG --
> libbpf: prog 'bpf_iden_max_ra': failed to load: -13
> libbpf: failed to load object '/root/readahead_noop.o’

Was it compiled with -O0 ?
clang didn't even inline what was marked as __always_inline which is strange.
Make sure to use the latest clang with -O2 -target bpf -g -mcpu=v3

You should see something like:

llvm-objdump -S -r bpf_cubic.bpf.o|head -50

bpf_cubic.bpf.o:    file format elf64-bpf

Disassembly of section struct_ops/bpf_cubic_init:

0000000000000000 <bpf_cubic_init>:
; void BPF_PROG(bpf_cubic_init, struct sock *sk)
       0:    79 12 00 00 00 00 00 00    r2 = *(u64 *)(r1 + 0x0)
       1:    b7 03 00 00 20 00 00 00    r3 = 0x20
        0000000000000008:  CO-RE <byte_off> [5] struct
inet_connection_sock::icsk_ca_priv (0:5)
       2:    bf 21 00 00 00 00 00 00    r1 = r2
       3:    0f 31 00 00 00 00 00 00    r1 += r3
       4:    b7 03 00 00 00 00 00 00    r3 = 0x0
;     ca->last_max_cwnd = 0;
       5:    63 31 04 00 00 00 00 00    *(u32 *)(r1 + 0x4) = r3
       6:    63 32 20 00 00 00 00 00    *(u32 *)(r2 + 0x20) = r3
        0000000000000030:  CO-RE <byte_off> [5] struct
inet_connection_sock::icsk_ca_priv (0:5)
       7:    63 31 0c 00 00 00 00 00    *(u32 *)(r1 + 0xc) = r3
       8:    63 31 08 00 00 00 00 00    *(u32 *)(r1 + 0x8) = r3





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux