Re: [PATCH bpf-next v1 03/19] bpf: add bpf_map iterator

Yonghong Song <yhs@xxxxxx> · Tue, 28 Apr 2020 18:02:40 -0700

On 4/28/20 5:37 PM, Martin KaFai Lau wrote:
On Mon, Apr 27, 2020 at 01:12:37PM -0700, Yonghong Song wrote:
The bpf_map iterator is implemented.
The bpf program is called at seq_ops show() and stop() functions.
bpf_iter_get_prog() will retrieve bpf program and other
parameters during seq_file object traversal. In show() function,
bpf program will traverse every valid object, and in stop()
function, bpf program will be called one more time after all
objects are traversed.

The first member of the bpf context contains the meta data, namely,
the seq_file, session_id and seq_num. Here, the session_id is
a unique id for one specific seq_file session. The seq_num is
the number of bpf prog invocations in the current session.
The bpf_iter_get_prog(), which will be implemented in subsequent
patches, will have more information on how meta data are computed.

The second member of the bpf context is a struct bpf_map pointer,
which bpf program can examine.

The target implementation also provided the structure definition
for bpf program and the function definition for verifier to
verify the bpf program. Specifically for bpf_map iterator,
the structure is "bpf_iter__bpf_map" andd the function is
"__bpf_iter__bpf_map".

More targets will be implemented later, all of which will include
the following, similar to bpf_map iterator:
   - seq_ops() implementation
   - function definition for verifier to verify the bpf program
   - seq_file private data size
   - additional target feature

Signed-off-by: Yonghong Song <yhs@xxxxxx>
---
  include/linux/bpf.h   |  10 ++++
  kernel/bpf/Makefile   |   2 +-
  kernel/bpf/bpf_iter.c |  19 ++++++++
  kernel/bpf/map_iter.c | 107 ++++++++++++++++++++++++++++++++++++++++++
  kernel/bpf/syscall.c  |  13 +++++
  5 files changed, 150 insertions(+), 1 deletion(-)
  create mode 100644 kernel/bpf/map_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5e56abc1e2f1..4ac8d61f7c3e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1078,6 +1078,7 @@ int  generic_map_update_batch(struct bpf_map *map,
  int  generic_map_delete_batch(struct bpf_map *map,
  			      const union bpf_attr *attr,
  			      union bpf_attr __user *uattr);
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
  
  extern int sysctl_unprivileged_bpf_disabled;
  
@@ -1118,7 +1119,16 @@ struct bpf_iter_reg {
  	u32 target_feature;
  };
  
+struct bpf_iter_meta {
+	__bpf_md_ptr(struct seq_file *, seq);
+	u64 session_id;
+	u64 seq_num;
+};
+
  int bpf_iter_reg_target(struct bpf_iter_reg *reg_info);
+struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
+				   u64 *session_id, u64 *seq_num, bool is_last);
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
  
  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a8b0febd3f6..b2b5eefc5254 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
  obj-y := core.o
  CFLAGS_core.o += $(call cc-disable-warning, override-init)
  
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
  obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 1115b978607a..284c95587803 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -48,3 +48,22 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info)
  
  	return 0;
  }
+
+struct bpf_prog *bpf_iter_get_prog(struct seq_file *seq, u32 priv_data_size,
+				   u64 *session_id, u64 *seq_num, bool is_last)
+{
+	return NULL;
Can this patch be moved after this function is implemented?

I tried to have an example on how regristration looks like,
so I put bpf_map iterator implementation patch immediately
after the bpf_iter_reg_target() patch. Unfortunately, I make
the iterator implementation complete and compiler can pass,
I need this function() to be implemented in the above.

I guess I can delay this patch until I can properly
implement it, just like my RFC v2.


+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+	int ret;
+
+	migrate_disable();
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, ctx);
+	rcu_read_unlock();
+	migrate_enable();
+
+	return ret;
+}
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..bb3ad4c3bde5
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+
+struct bpf_iter_seq_map_info {
+	struct bpf_map *map;
+	u32 id;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+	u32 id = info->id;
+
+	map = bpf_map_get_curr_or_next(&id);
+	if (IS_ERR_OR_NULL(map))
+		return NULL;
+
+	++*pos;
Does pos always need to be incremented here?

Yes, I skipped passing SEQ_START_TOKEN to show(). Put it another way,
bpf program won't be called for SEQ_START_TOKEN, so I did a shortcut here.


+	info->map = map;
+	info->id = id;
+	return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+	struct bpf_map *map;
+
+	++*pos;
+	++info->id;
+	map = bpf_map_get_curr_or_next(&info->id);
+	if (IS_ERR_OR_NULL(map))
+		return NULL;
+
+	bpf_map_put(info->map);
+	info->map = map;
+	return map;
+}
+
+struct bpf_iter__bpf_map {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+};
+
+int __init __bpf_iter__bpf_map(struct bpf_iter_meta *meta, struct bpf_map *map)
+{
+	return 0;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_meta meta;
+	struct bpf_iter__bpf_map ctx;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ctx.meta = &meta;
+	ctx.map = v;
+	meta.seq = seq;
+	prog = bpf_iter_get_prog(seq, sizeof(struct bpf_iter_seq_map_info),
+				 &meta.session_id, &meta.seq_num,
+				 v == (void *)0);
 From looking at seq_file.c, when will show() be called with "v == NULL"?

In the stop() function.


+	if (prog)
+		ret = bpf_iter_run_prog(prog, &ctx);
+
+	return ret == 0 ? 0 : -EINVAL;
The verifier change in patch 4 should have ensured that prog
can only return 0?

Yes. I forgot to update this after last minutes I added verifier
enforcement. I can do
	if (prog)
		bpf_iter_run_prog(prog, &ctx);

	return 0;


+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+	struct bpf_iter_seq_map_info *info = seq->private;
+
+	if (!v)
+		bpf_map_seq_show(seq, v);

bpf program for NULL object is called here.

+
+	if (info->map) {
+		bpf_map_put(info->map);
+		info->map = NULL;
+	}
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+	.start	= bpf_map_seq_start,
+	.next	= bpf_map_seq_next,
+	.stop	= bpf_map_seq_stop,
+	.show	= bpf_map_seq_show,
+};
+
+static int __init bpf_map_iter_init(void)
+{
+	struct bpf_iter_reg reg_info = {
+		.target			= "bpf_map",
+		.target_func_name	= "__bpf_iter__bpf_map",
+		.seq_ops		= &bpf_map_seq_ops,
+		.seq_priv_size		= sizeof(struct bpf_iter_seq_map_info),
+		.target_feature		= 0,
+	};
+
+	return bpf_iter_reg_target(&reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7626b8024471..022187640943 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2800,6 +2800,19 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
  	return err;
  }
  
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+	struct bpf_map *map;
+
+	spin_lock_bh(&map_idr_lock);
+	map = idr_get_next(&map_idr, id);
+	if (map)
+		map = __bpf_map_inc_not_zero(map, false);
nit. For the !map case, set "map = ERR_PTR(-ENOENT)" so that
the _OR_NULL() test is not needed.  It will be more consistent
with other error checking codes in syscall.c.

Good point, will do that.


+	spin_unlock_bh(&map_idr_lock);
+
+	return map;
+}
+
  #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
  
  struct bpf_prog *bpf_prog_by_id(u32 id)
--
2.24.1