struct_ops usage - verifier interactions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I’m a phd student working on adding a new struct_ops type for a potential project, and I have a few questions about its usage.

The background is that we’re thinking of delegating some kernel heuristics to bpf programs, and we’re starting with the readahead logic. The prototype right now just adds an ugly struct_ops right in the middle of the block read path (specifically in read_pages just as a test, will probably split out the read vs fault path later) as a test. The patch I have simply packages up a few readahead control variables already present into a new `bpf_fs_readahead_state` struct and passes that to the struct_ops bpf program.

The problem, however, is that the verifier rejects a program that is literally nothing but `return state->ra_pages`. Any dereference of the passed-in struct lead to a verifier rejection with `R1 invalid mem access ‘scalar’`, complaining that the passed-in pointer is invalid. The register, when passed to `is_valid_access`, is already of scalar type, so even a `return true` `is_valid_access` still leads to the rejection.

I’m confused about why the pointer is not a `PTR_TO_BTF_ID`, which is the case when looking at calls to bpf_cong’s `is_valid_access` when loading bpf_cubic.c. I’m reviewing the fuse-bpf and sched_ext patches, and I don’t see where I need to register my new state struct with the verifier, if that’s indeed what I need to do for the verifier to correctly identify the pointer in `check_mem_access`. I see references to special pointer types for tcp_sock in the case of tcp_cong, but not anything for the fuse-bpf patches.

If there’s documentation somewhere or sample code I should consult please let me know as well.

Here’s my (very ugly, just trying to get it to work) patch is below, along with the program I’m trying to load and a verification log.

---
 include/linux/bpf.h |  21 +++++++++
 mm/Makefile         |   3 ++
 mm/readahead.c      |  42 +++++++++++++++++
 mm/readahead_ops.c  | 112 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)
 create mode 100644 mm/readahead_ops.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c7aa99b44d..0e461d532c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1799,6 +1799,27 @@ struct bpf_dummy_ops {
 	int (*test_sleepable)(struct bpf_dummy_ops_state *cb);
 };
 
+// KNN: put it here for now for lack of a better place
+
+struct bpf_fs_readahead_state {
+	int i_ino;
+	unsigned int size;
+	unsigned int async_size;
+	unsigned int ra_pages;
+	unsigned int mmap_miss;
+	loff_t prev_pos;
+};
+
+struct bpf_fs_readahead_ops {
+	int (*get_max_ra)(struct bpf_fs_readahead_state *cb);
+	int (*get_ra)(struct bpf_fs_readahead_state *cb);
+};
+
+// temporarily globals as we get this working
+
+int bpf_fs_ra_register(struct bpf_fs_readahead_ops *ops);
+void bpf_fs_ra_unregister(struct bpf_fs_readahead_ops *ops);
+
 int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
diff --git a/mm/Makefile b/mm/Makefile
index e4b5b75aae..ccd4287286 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -134,3 +134,6 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+
+# TODO use bpf for now
+obj-$(CONFIG_BPF) += readahead_ops.o
diff --git a/mm/readahead.c b/mm/readahead.c
index 23620c57c1..a2c88fb629 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -113,6 +113,7 @@
  * ->read_folio() which may be less efficient.
  */
 
+#include "linux/bpf.h"
 #include <linux/blkdev.h>
 #include <linux/kernel.h>
 #include <linux/dax.h>
@@ -143,6 +144,28 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);
 
+static struct bpf_fs_readahead_ops *bpf_fs_ra_ops;
+
+int bpf_fs_ra_register(struct bpf_fs_readahead_ops *ops)
+{
+	if (bpf_fs_ra_ops)
+		return -EBUSY;
+
+	bpf_fs_ra_ops = ops;
+	return 0;
+}
+
+void bpf_fs_ra_unregister(struct bpf_fs_readahead_ops *ops)
+{
+	if (bpf_fs_ra_ops == ops)
+		bpf_fs_ra_ops = NULL;
+}
+
+/*
+ * There are two ways to get here: from the filemap_read path, or from the
+ * page fault handler for memory-mapped files (filemap_fault). The first will 
+ * set up readahead with ondemand_readahead, the second will do ???
+*/
 static void read_pages(struct readahead_control *rac)
 {
 	const struct address_space_operations *aops = rac->mapping->a_ops;
@@ -156,6 +179,25 @@ static void read_pages(struct readahead_control *rac)
 		psi_memstall_enter(&rac->_pflags);
 	blk_start_plug(&plug);
 
+	// there are two possible places to put this:
+	// 1. In both the filemap_read and filemap_fault paths, after the call
+	// determine readahead
+	// 2. Here, where they both converge after determining readahead.
+	// Not sure which is better, so I'm leaving it here for now
+	if (bpf_fs_ra_ops) {
+		struct bpf_fs_readahead_state state = {
+			.i_ino = rac->mapping->host->i_ino,
+			.size = rac->ra->size,
+			.async_size = rac->ra->async_size,
+			.ra_pages = rac->ra->ra_pages,
+			.mmap_miss = rac->ra->mmap_miss,
+			.prev_pos = rac->ra->prev_pos,
+		};
+
+		rac->ra->ra_pages = bpf_fs_ra_ops->get_max_ra(&state);
+		// rac->ra->size = bpf_fs_ra_ops->get_ra(&state);
+	}
+
 	if (aops->readahead) {
 		aops->readahead(rac);
 		/*
diff --git a/mm/readahead_ops.c b/mm/readahead_ops.c
new file mode 100644
index 0000000000..be5de16b88
--- /dev/null
+++ b/mm/readahead_ops.c
@@ -0,0 +1,112 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+
+static int fs_readahead_get_max_ra_stub(struct bpf_fs_readahead_state *state)
+{
+	return 0;
+}
+
+static int fs_readahead_get_ra_stub(struct bpf_fs_readahead_state *state)
+{
+	return 0;
+}
+
+struct bpf_fs_readahead_ops knn_readahead = {
+	.get_max_ra = fs_readahead_get_max_ra_stub,
+	.get_ra = fs_readahead_get_ra_stub,
+};
+
+// verifier
+
+static bool fs_readahead_ops_is_valid_access(int off, int size,
+					     enum bpf_access_type type,
+					     const struct bpf_prog *prog,
+					     struct bpf_insn_access_aux *info)
+{
+	return true;
+}
+
+static int fs_readahead_btf_struct_access(struct bpf_verifier_log *log,
+					  const struct bpf_reg_state *reg,
+					  int off, int size)
+{
+	return 0;
+}
+
+struct bpf_verifier_ops bpf_fs_readahead_verifier_ops = {
+	.get_func_proto = NULL,
+	.is_valid_access = fs_readahead_ops_is_valid_access,
+	.btf_struct_access = fs_readahead_btf_struct_access,
+};
+
+// management
+
+static int register_bpf_fs_ra(void *kdata)
+{
+	printk(KERN_ALERT "register_bpf_fs_ra\n");
+	return bpf_fs_ra_register(kdata);
+}
+
+static void unregister_bpf_fs_ra(void *kdata)
+{
+	printk(KERN_ALERT "unregister_bpf_fs_ra\n");
+	bpf_fs_ra_unregister(kdata);
+}
+
+static int bpf_fs_ra_check_member(const struct btf_type *t,
+				   const struct btf_member *member,
+				   const struct bpf_prog *prog)
+{
+	return 0;
+}
+
+static int bpf_fs_ra_init_member(const struct btf_type *t,
+				 const struct btf_member *member,
+				 void *kdata, const void *udata)
+{
+	// no-op for now
+	return 0;
+}
+
+static struct btf *bpf_fs_ra_btf;
+
+static int bpf_fs_ra_init(struct btf *btf)
+{
+	bpf_fs_ra_btf = btf;
+	return 0;
+}
+
+static struct bpf_struct_ops bpf_fs_readahead_ops_struct = {
+	.verifier_ops = &bpf_fs_readahead_verifier_ops,
+	.reg = register_bpf_fs_ra,
+	.unreg = unregister_bpf_fs_ra,
+	.update = NULL, // cannot be updated
+	.check_member = bpf_fs_ra_check_member,
+	.init_member = bpf_fs_ra_init_member,
+	.init = bpf_fs_ra_init,
+	.validate = NULL,
+
+	.name = "bpf_fs_readahead_ops",
+	.cfi_stubs = &knn_readahead,
+	.owner = THIS_MODULE,
+};
+
+static void noop(void *p)
+{
+	// no-op
+}
+
+static int __init bpf_fs_ra_kfunc_init(void)
+{
+	int ret = 0;
+	noop(&bpf_fs_readahead_ops_struct);
+	ret = register_bpf_struct_ops(&bpf_fs_readahead_ops_struct,
+					  bpf_fs_readahead_ops);
+	return ret;
+}
+late_initcall(bpf_fs_ra_kfunc_init);
-- 
2.34.1

And also the bpf program I’m trying to load:

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <linux/bpf.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/bpf_iden_max_ra")
int BPF_PROG(bpf_iden_max_ra, struct bpf_fs_readahead_state *s)
{
    return s->ra_pages;
}

SEC("struct_ops/bpf_iden_ra")
int BPF_PROG(bpf_iden_ra, struct bpf_fs_readahead_state *s)
{
    return s->size;
}

SEC(".struct_ops")
struct bpf_fs_readahead_ops bpf_fs_readahead_ops = {
    .get_max_ra = (void *)bpf_iden_max_ra,
    .get_ra = (void *)bpf_iden_ra,
};

And the verification log:

+ bpftool struct_ops register /root/readahead_noop.o
libbpf: prog 'bpf_iden_max_ra': BPF program load failed: Permission denied
libbpf: prog 'bpf_iden_max_ra': -- BEGIN PROG LOAD LOG --
0: R1=ctx() R10=fp0
; int BPF_PROG(bpf_iden_max_ra, struct bpf_fs_readahead_state *s) @ readahead_noop.c:8
0: (7b) *(u64 *)(r10 -8) = r1         ; R1=ctx() R10=fp0 fp-8_w=ctx()
1: (79) r2 = *(u64 *)(r1 +0)          ; R1=ctx() R2_w=scalar()
2: (85) call pc+1
caller:
 R10=fp0 fp-8_w=ctx()
callee:
 frame1: R1=ctx() R2_w=scalar() R10=fp0
4: frame1: R1=ctx() R2_w=scalar() R10=fp0
4: (7b) *(u64 *)(r10 -8) = r1         ; frame1: R1=ctx() R10=fp0 fp-8_w=ctx()
5: (7b) *(u64 *)(r10 -16) = r2        ; frame1: R2_w=scalar(id=1) R10=fp0 fp-16_w=scalar(id=1)
; return s->ra_pages; @ readahead_noop.c:10
6: (61) r0 = *(u32 *)(r2 +12)
R2 invalid mem access 'scalar'
processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
-- END PROG LOAD LOG --
libbpf: prog 'bpf_iden_max_ra': failed to load: -13
libbpf: failed to load object '/root/readahead_noop.o’

Also available on GitHub at https://github.com/Khoury-srg/KernelNN.

Thanks,
Isaac Khor






[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux