On 11/18/23 5:04 AM, Hou Tao wrote:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec3c90202ffe6..8faa1af4b39df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -245,6 +245,12 @@ struct bpf_list_node_kern {
void *owner;
} __attribute__((aligned(8)));
+enum {
+ BPF_MAP_ACC_NORMAL_PROG_CTX = 1,
+ BPF_MAP_ACC_SLEEPABLE_PROG_CTX = 2,
nit. It is a bit flag. Use (1U << 0) and (1U << 1) to make it more
obvious.
Will fix.
How about renaming these to BPF_MAP_RCU_GP and BPF_MAP_RCU_TT_GP to
better reflect that they are used to decide whether map_free needs to
wait for any RCU GP?
OK. It is fine with me.
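For reference, a sketch of the enum with both suggestions applied
(names per the proposal above; exact values still open):

enum {
	BPF_MAP_RCU_GP		= (1U << 0),	/* free must wait for a normal RCU GP */
	BPF_MAP_RCU_TT_GP	= (1U << 1),	/* free must wait for a tasks-trace RCU GP */
};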
+ BPF_MAP_ACC_PROG_CTX_MASK = BPF_MAP_ACC_NORMAL_PROG_CTX | BPF_MAP_ACC_SLEEPABLE_PROG_CTX,
+};
+
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
@@ -292,7 +298,8 @@ struct bpf_map {
} owner;
bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
- bool free_after_mult_rcu_gp;
+ atomic_t owned_prog_ctx;
Instead of the enum flags, this should only need a true/false value to
tell whether the outer map has ever been used by a sleepable prog or
not. Maybe rename this to "atomic_t used_by_sleepable;"?
I considered that, but there are maps which are only accessed from
userspace (e.g. maps used through BPF_PROG_BIND_MAP or maps in
test_maps), so I thought one bool would not be enough.
Good point.
In this case, a nit on the name. The "prog_ctx" part in owned_prog_ctx
usually means something different, like "struct __sk_buff". How about using
this pair of names?
/* may be just "long" and then set_bit? */
atomic_t used_in_rcu_gp;
atomic_t free_by_rcu_gp;
Shortened the names by dropping the "_flags" suffix suggested in the earlier comment.
+ atomic_t may_be_accessed_prog_ctx;
nit. rename this to "rcu_gp_flags;"
Will do.
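For reference, a sketch of the "long" plus set_bit() alternative
mentioned above (the *_BIT names are made up here; note that
set_bit()/test_bit() take bit numbers, not masks):

	/* bit numbers for set_bit()/test_bit() */
	#define BPF_MAP_RCU_GP_BIT	0
	#define BPF_MAP_RCU_TT_GP_BIT	1

	/* in struct bpf_map */
	unsigned long used_in_rcu_gp;
	unsigned long free_by_rcu_gp;

	/* writer side, e.g. in the verifier */
	set_bit(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP_BIT :
		BPF_MAP_RCU_GP_BIT, &map->used_in_rcu_gp);

	/* reader side, e.g. in bpf_map_free_deferred() */
	if (test_bit(BPF_MAP_RCU_TT_GP_BIT, &map->free_by_rcu_gp))
		synchronize_rcu_tasks_trace();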
s64 __percpu *elem_count;
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cf33630655661..e3d26a89ac5b6 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -131,12 +131,20 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
{
struct bpf_map *inner_map = ptr;
- /* The inner map may still be used by both non-sleepable and sleepable
- * bpf program, so free it after one RCU grace period and one tasks
- * trace RCU grace period.
+ /* Defer the freeing of the inner map according to the program
+ * contexts that own the outer maps, so waiting for unnecessary
+ * RCU GPs can be avoided.
*/
- if (deferred)
- WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ if (deferred) {
+ /* owned_prog_ctx may be updated concurrently by a new bpf program,
+ * so add smp_mb() below to ensure that reading owned_prog_ctx
+ * will return the newly-set bit when the new bpf program finds
+ * the inner map before it is removed from the outer map.
+ */
+ smp_mb();
This part set my head spinning a little, so it is better to ask. The
owned_prog_ctx is set at verification time. Many instructions run
between the prog being verified, attached (another syscall), and then
executed to do the actual lookup(&outer_map). Is this level of
reordering practically possible?
Er, I added the memory barrier due to uncertainty. According to [1],
a syscall doesn't imply a memory barrier. And if I understand correctly,
when the program is loaded and attached through bpf_sys_bpf, there will
be no new syscall.

[1]:
https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html

My immediate thought was a more straightforward spin lock instead.
Agree that smp_mb() works as well, ok.
+ atomic_or(atomic_read(&map->owned_prog_ctx),
+ &inner_map->may_be_accessed_prog_ctx);
+ }
bpf_map_put(inner_map);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e2d2701ce2c45..5a7906f2b027e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -694,12 +694,20 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
+ int acc_ctx;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
- if (READ_ONCE(map->free_after_mult_rcu_gp))
- synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
+ acc_ctx = atomic_read(&map->may_be_accessed_prog_ctx) & BPF_MAP_ACC_PROG_CTX_MASK;
The mask should not be needed.
Yep. Will remove it.
+ if (acc_ctx) {
+ if (acc_ctx == BPF_MAP_ACC_NORMAL_PROG_CTX)
+ synchronize_rcu();
+ else if (acc_ctx == BPF_MAP_ACC_SLEEPABLE_PROG_CTX)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
Is it better to add a rcu_head to the map and then use call_rcu_().
e.g. when there is many delete happened to the outer map during a
process restart to re-populate the outer map. It is relatively much
cheaper to add a rcu_head to the map comparing to adding one for each
elem. wdyt?
Good idea. call_rcu() will be much cheaper than synchronize_rcu(). Will do.
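For reference, a sketch of the chained call_rcu() approach (this
assumes a "struct rcu_head rcu;" member is added to struct bpf_map;
the callback names here are made up):

static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
{
	struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
{
	struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

	/* one tasks-trace RCU GP has elapsed; chain a normal RCU GP */
	call_rcu(&map->rcu, bpf_map_free_rcu_gp);
}

	/* in bpf_map_free_deferred(), instead of the synchronize_*() calls: */
	call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);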
+ }
/* implementation dependent freeing */
map->ops->map_free(map);
@@ -5326,6 +5334,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
+ /* No need to update owned_prog_ctx, because the bpf program doesn't
+ * access the map.
+ */
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bd1c42eb540f1..d8d5432b240dc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18012,12 +18012,17 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
return -E2BIG;
}
+ atomic_or(env->prog->aux->sleepable ? BPF_MAP_ACC_SLEEPABLE_PROG_CTX :
+ BPF_MAP_ACC_NORMAL_PROG_CTX,
+ &map->owned_prog_ctx);
This is setting a bit. Would set_bit() work just as well?
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_used_maps()
*/
bpf_map_inc(map);
+ /* Paired with smp_mb() in bpf_map_fd_put_ptr() */
+ smp_mb__after_atomic();
To make it clearer, can this be moved immediately after the set_bit / atomic_or
above?
aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
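Putting the comments on this hunk together, the verifier side might
end up as the following (a sketch only, using the renamed flags from
earlier in the thread):

	/* record which flavors of RCU GP protect readers of this map */
	atomic_or(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP : BPF_MAP_RCU_GP,
		  &map->used_in_rcu_gp);
	/* Paired with smp_mb() in bpf_map_fd_put_ptr() */
	smp_mb__after_atomic();

	bpf_map_inc(map);
	aux->map_index = env->used_map_cnt;
	env->used_maps[env->used_map_cnt++] = map;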