Re: [PATCH v2 bpf-next] bpf: add lookup_and_delete_elem support to hashtab

Yonghong Song <yhs@xxxxxx> · Mon, 1 Mar 2021 18:02:37 -0800

On 2/28/21 1:40 AM, Denis Salopek wrote:
Extend the existing bpf_map_lookup_and_delete_elem() functionality to
hashtab maps, in addition to stacks and queues.
Create a new hashtab bpf_map_ops function that does lookup and deletion
of the element under the same bucket lock and add the created map_ops to
bpf.h.
Add the appropriate test cases to 'maps' and 'lru_map' selftests
accompanied with new test_progs.

Cc: Juraj Vijtiuk <juraj.vijtiuk@xxxxxxxxxx>
Cc: Luka Oreskovic <luka.oreskovic@xxxxxxxxxx>
Cc: Luka Perkov <luka.perkov@xxxxxxxxxx>
Signed-off-by: Denis Salopek <denis.salopek@xxxxxxxxxx>
---
v2: Add functionality for LRU/per-CPU, add test_progs tests.
---
  include/linux/bpf.h                           |   2 +
  kernel/bpf/hashtab.c                          |  80 +++++
  kernel/bpf/syscall.c                          |  14 +-
  .../bpf/prog_tests/lookup_and_delete.c        | 283 ++++++++++++++++++
  .../bpf/progs/test_lookup_and_delete.c        |  26 ++
  tools/testing/selftests/bpf/test_lru_map.c    |   8 +
  tools/testing/selftests/bpf/test_maps.c       |  19 +-
  7 files changed, 430 insertions(+), 2 deletions(-)
  create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
  create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_and_delete.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4c730863fa77..0bcc4f89af40 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -67,6 +67,8 @@ struct bpf_map_ops {
  	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
  	int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
  				union bpf_attr __user *uattr);
+	int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
+					  void *value);
  	int (*map_lookup_and_delete_batch)(struct bpf_map *map,
  					   const union bpf_attr *attr,
  					   union bpf_attr __user *uattr);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 330d721dd2af..8c3334d1b6b3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1401,6 +1401,82 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
  	rcu_read_unlock();
  }
  
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					     void *value, bool is_lru_map,
+					     bool is_percpu)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 hash, key_size, value_size;
+	struct hlist_nulls_head *head;
+	int cpu, off = 0, ret;
+	struct htab_elem *l;
+	unsigned long flags;
+	void __percpu *pptr;
+	struct bucket *b;
+
+	key_size = map->key_size;
+	value_size = round_up(map->value_size, 8);
+
+	hash = htab_map_hash(key, key_size, htab->hashrnd);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	ret = htab_lock_bucket(htab, b, hash, &flags);
+	if (ret)
+		return ret;
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (l) {
+		if (is_percpu) {
+			pptr = htab_elem_get_ptr(l, key_size);
+			for_each_possible_cpu(cpu) {
+				bpf_long_memcpy(value + off,
+						per_cpu_ptr(pptr, cpu),
+						value_size);
+				off += value_size;
+			}
+		} else {
+			copy_map_value(map, value, l->key + round_up(key_size, 8));

For hashtab lookup elem, BPF_F_LOCK flag may be set by user, I think 
hashtab lookup_and_delete_elem should also support this flag, so user
can ensure they always get a lock protected sane value.

We have the following libbpf APIs.

LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value);
LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void 
*value,
                                         __u64 flags);
LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
                                              void *value);

Previously, bpf_map_lookup_and_delete_elem only supports queue/stack,
which does not need flags as it does not support BPF_F_LOCK so we
are fine.

Maybe similar to bpf_map_lookup_elem_flags() we add a
bpf_map_lookup_and_delete_elem_flags()? Maybe libbpf v1.0
can consolidate into a better uniform api.

+		}
+
+		hlist_nulls_del_rcu(&l->hash_node);
+		if (!is_lru_map)
+			free_htab_elem(htab, l);
+	} else
+		ret = -ENOENT;
+
+	htab_unlock_bucket(htab, b, hash, flags);
+
+	if (is_lru_map && l)
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+
+	return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map,
+					   void *key, void *value)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, false);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						  void *key, void *value)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, false, true);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+					       void *value)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, false);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+						      void *key, void *value)
+{
+	return __htab_map_lookup_and_delete_elem(map, key, value, true, true);
+}
+
  static int
  __htab_map_lookup_and_delete_batch(struct bpf_map *map,
  				   const union bpf_attr *attr,
@@ -1934,6 +2010,7 @@ const struct bpf_map_ops htab_map_ops = {
  	.map_free = htab_map_free,
  	.map_get_next_key = htab_map_get_next_key,
  	.map_lookup_elem = htab_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
  	.map_update_elem = htab_map_update_elem,
  	.map_delete_elem = htab_map_delete_elem,
  	.map_gen_lookup = htab_map_gen_lookup,
@@ -1954,6 +2031,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
  	.map_free = htab_map_free,
  	.map_get_next_key = htab_map_get_next_key,
  	.map_lookup_elem = htab_lru_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
  	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
  	.map_update_elem = htab_lru_map_update_elem,
  	.map_delete_elem = htab_lru_map_delete_elem,
@@ -2077,6 +2155,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
  	.map_free = htab_map_free,
  	.map_get_next_key = htab_map_get_next_key,
  	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
  	.map_update_elem = htab_percpu_map_update_elem,
  	.map_delete_elem = htab_map_delete_elem,
  	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
@@ -2096,6 +2175,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
  	.map_free = htab_map_free,
  	.map_get_next_key = htab_map_get_next_key,
  	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
  	.map_update_elem = htab_lru_percpu_map_update_elem,
  	.map_delete_elem = htab_lru_map_delete_elem,
  	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c859bc46d06c..2634aa4a2f37 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1495,7 +1495,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
  		goto err_put;
  	}
  
-	value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
  
  	err = -ENOMEM;
  	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1505,6 +1505,18 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
  	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
  	    map->map_type == BPF_MAP_TYPE_STACK) {
  		err = map->ops->map_pop_elem(map, value);
+	} else if (map->map_type == BPF_MAP_TYPE_HASH ||
+		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		if (!bpf_map_is_dev_bound(map)) {
+			bpf_disable_instrumentation();
+			rcu_read_lock();
+			err = map->ops->map_lookup_and_delete_elem(map, key, value);
+			rcu_read_unlock();
+			bpf_enable_instrumentation();
+			maybe_wait_bpf_programs(map);

maybe_wait_bpf_programs(map) is mostly for map-in-map.
but I think it is okay to put it here in case in the future
we will support map-in-map here. If maybe_wait_bpf_programs()
get inlined which mostly likely is the case, the compiler
should be able to optimize it away.

+		}
  	} else {
  		err = -ENOTSUPP;
  	}
diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
new file mode 100644
index 000000000000..05123bbcdc1c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/lookup_and_delete.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <test_progs.h>
+#include "test_lookup_and_delete.skel.h"
+
+#define START_VALUE 1234
+#define NEW_VALUE 4321
+#define MAX_ENTRIES 2
+
+static int duration;
+static int nr_cpus;
+
+static int fill_values(int map_fd)
+{
+	__u64 key, value = START_VALUE;
+	int err;
+
+	for (key = 1; key < MAX_ENTRIES + 1; key++) {
+		err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
+		if (CHECK(err, "bpf_map_update_elem", "failed\n"))

You can use
	if (!ASSERT_OK(err, "bpf_map_update_elem"))
to save you from explicit "failed" string.
The same for some later other CHECK usages.

+			return -1;
+	}
+
+	return 0;
+}
+
+static int fill_values_percpu(int map_fd)
+{
+	BPF_DECLARE_PERCPU(__u64, value);
+	int i, err;
+	u64 key;
+
+	for (i = 0; i < nr_cpus; i++)
+		bpf_percpu(value, i) = START_VALUE;
+
+	for (key = 1; key < MAX_ENTRIES + 1; key++) {
+		err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST);
+		if (CHECK(err, "bpf_map_update_elem", "failed\n"))
+			return -1;
+	}
+
+	return 0;
+}
+
[...]
diff --git a/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
new file mode 100644
index 000000000000..eb19de8bb415
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lookup_and_delete.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+__u32 set_pid;
+__u64 set_key;
+__u64 set_value;

Please add "= 0" to the above declaration to make
it llvm10 friendly.

+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 2);
+	__type(key, __u64);
+	__type(value, __u64);
+} hash_map SEC(".maps");
+
+SEC("tp/syscalls/sys_enter_getpgid")
+int bpf_lookup_and_delete_test(const void *ctx)
+{
+	if (set_pid == bpf_get_current_pid_tgid() >> 32)
+		bpf_map_update_elem(&hash_map, &set_key, &set_value, BPF_NOEXIST);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
[...]