Re: [PATCH v2 bpf-next 4/4] bpf: Add kernel module with user mode driver that populates bpffs.

Daniel Borkmann <daniel@xxxxxxxxxxxxx> · Thu, 23 Jul 2020 00:16:13 +0200

On 7/17/20 6:40 AM, Alexei Starovoitov wrote:
From: Alexei Starovoitov <ast@xxxxxxxxxx>

Add kernel module with user mode driver that populates bpffs with
BPF iterators.

$ mount bpffs /my/bpffs/ -t bpf
$ ls -la /my/bpffs/
total 4
drwxrwxrwt  2 root root    0 Jul  2 00:27 .
drwxr-xr-x 19 root root 4096 Jul  2 00:09 ..
-rw-------  1 root root    0 Jul  2 00:27 maps.debug
-rw-------  1 root root    0 Jul  2 00:27 progs.debug

The user mode driver will load BPF Type Formats, create BPF maps, populate BPF
maps, load two BPF programs, attach them to BPF iterators, and finally send two
bpf_link IDs back to the kernel.
The kernel will pin two bpf_links into newly mounted bpffs instance under
names "progs.debug" and "maps.debug". These two files become human readable.

$ cat /my/bpffs/progs.debug
   id name            pages attached
   11 dump_bpf_map        1 bpf_iter_bpf_map
   12 dump_bpf_prog       1 bpf_iter_bpf_prog
   27 test_pkt_access     1
   32 test_main           1 test_pkt_access test_pkt_access
   33 test_subprog1       1 test_pkt_access_subprog1 test_pkt_access
   34 test_subprog2       1 test_pkt_access_subprog2 test_pkt_access
   35 test_subprog3       1 test_pkt_access_subprog3 test_pkt_access
   36 new_get_skb_len     1 get_skb_len test_pkt_access
   37 new_get_skb_ifi     1 get_skb_ifindex test_pkt_access
   38 new_get_constan     1 get_constant test_pkt_access

The BPF program dump_bpf_prog() in iterators.bpf.c is printing this data about
all BPF programs currently loaded in the system. This information is unstable
and will change from kernel to kernel as ".debug" suffix conveys.

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
[...]
  static int bpf_obj_do_pin(const char __user *pathname, void *raw,
  			  enum bpf_type type)
  {
@@ -638,6 +661,61 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
  	return 0;
  }
  
+struct bpf_preload_ops bpf_preload_ops = { .info.driver_name = "bpf_preload" };
+EXPORT_SYMBOL_GPL(bpf_preload_ops);
+
+static int populate_bpffs(struct dentry *parent)
+{
+	struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
+	struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
+	int err = 0, i;
+
+	mutex_lock(&bpf_preload_ops.lock);
+	if (!bpf_preload_ops.do_preload) {
+		mutex_unlock(&bpf_preload_ops.lock);
+		request_module("bpf_preload");
+		mutex_lock(&bpf_preload_ops.lock);
+
+		if (!bpf_preload_ops.do_preload) {
+			pr_err("bpf_preload module is missing.\n"
+			       "bpffs will not have iterators.\n");
+			goto out;
+		}
+	}

Overall set looks good. One thing that appears to be possible from staring at
the code is that while we load the bpf_preload module here and below invoke the
modules' preload ops callbacks, there seems to be nothing that prevents the module
from being forcefully unloaded in parallel (e.g. no ref on the module held).

So it looks like the old bpfilter code was preventing exactly this through holding
bpfilter_ops.lock mutex during its {load,fini}_umh() modules init/exit functions.

Other than that, maybe it would be nice to have a test_progs selftests extension
which mounts multiple BPF fs instances, and asserts that if one of them has the
{progs,maps}.debug files that the other ones must have it as well, plus plain
reading of both (w/o parsing anything from there) just to make sure the dump
terminates .. at least to have some basic exercising of the code in there.

Thanks,
Daniel

+	if (!bpf_preload_ops.info.tgid) {
+		err = bpf_preload_ops.do_preload(objs);
+		if (err)
+			goto out;
+		for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+			links[i] = bpf_link_by_id(objs[i].link_id);
+			if (IS_ERR(links[i])) {
+				err = PTR_ERR(links[i]);
+				goto out;
+			}
+		}
+		for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+			err = bpf_iter_link_pin_kernel(parent,
+						       objs[i].link_name, links[i]);
+			if (err)
+				goto out;
+			/* do not unlink successfully pinned links even
+			 * if later link fails to pin
+			 */
+			links[i] = NULL;
+		}
+		err = bpf_preload_ops.do_finish();
+		if (err)
+			goto out;
+	}
+out:
+	mutex_unlock(&bpf_preload_ops.lock);
+	for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
+		if (!IS_ERR_OR_NULL(links[i]))
+			bpf_link_put(links[i]);
+	return err;
+}
+
  static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
  {
  	static const struct tree_descr bpf_rfiles[] = { { "" } };
@@ -654,8 +732,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
  	inode = sb->s_root->d_inode;
  	inode->i_op = &bpf_dir_iops;
  	inode->i_mode &= ~S_IALLUGO;
+	populate_bpffs(sb->s_root);
  	inode->i_mode |= S_ISVTX | opts->mode;
-
  	return 0;
  }
  
@@ -705,6 +783,8 @@ static int __init bpf_init(void)
[...]

diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
new file mode 100644
index 000000000000..cd10f291d6cd
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pid.h>
+#include <linux/fs.h>
+#include <linux/sched/signal.h>
+#include "bpf_preload.h"
+
+extern char bpf_preload_umd_start;
+extern char bpf_preload_umd_end;
+
+static int do_preload(struct bpf_preload_info *obj)
+{
+	int magic = BPF_PRELOAD_START;
+	struct pid *tgid;
+	loff_t pos = 0;
+	int i, err;
+	ssize_t n;
+
+	err = fork_usermode_driver(&bpf_preload_ops.info);
+	if (err)
+		return err;
+	tgid = bpf_preload_ops.info.tgid;
+
+	/* send the start magic to let UMD proceed with loading BPF progs */
+	n = kernel_write(bpf_preload_ops.info.pipe_to_umh,
+			 &magic, sizeof(magic), &pos);
+	if (n != sizeof(magic))
+		return -EPIPE;
+
+	/* receive bpf_link IDs and names from UMD */
+	pos = 0;
+	for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+		n = kernel_read(bpf_preload_ops.info.pipe_from_umh,
+				&obj[i], sizeof(*obj), &pos);
+		if (n != sizeof(*obj))
+			return -EPIPE;
+	}
+	return 0;
+}
+
+static int do_finish(void)
+{
+	int magic = BPF_PRELOAD_END;
+	struct pid *tgid;
+	loff_t pos = 0;
+	ssize_t n;
+
+	/* send the last magic to UMD. It will do a normal exit. */
+	n = kernel_write(bpf_preload_ops.info.pipe_to_umh,
+			 &magic, sizeof(magic), &pos);
+	if (n != sizeof(magic))
+		return -EPIPE;
+	tgid = bpf_preload_ops.info.tgid;
+	wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+	bpf_preload_ops.info.tgid = NULL;
+	return 0;
+}
+
+static int __init load_umd(void)
+{
+	int err;
+
+	err = umd_load_blob(&bpf_preload_ops.info, &bpf_preload_umd_start,
+			    &bpf_preload_umd_end - &bpf_preload_umd_start);
+	if (err)
+		return err;
+	bpf_preload_ops.do_preload = do_preload;
+	bpf_preload_ops.do_finish = do_finish;
+	return err;
+}
+
+static void __exit fini_umd(void)
+{
+	bpf_preload_ops.do_preload = NULL;
+	bpf_preload_ops.do_finish = NULL;
+	/* kill UMD in case it's still there due to earlier error */
+	kill_pid(bpf_preload_ops.info.tgid, SIGKILL, 1);
+	bpf_preload_ops.info.tgid = NULL;
+	umd_unload_blob(&bpf_preload_ops.info);
+}
[...]