On 4/29/24 2:36 PM, Kui-Feng Lee wrote:
+/* Called from the subsystem that consumes the struct_ops.
+ *
+ * The caller should protect this function by holding rcu_read_lock() to
+ * ensure "data" is valid. However, this function may unlock rcu
+ * temporarily. The caller should not rely on the preceding rcu_read_lock()
+ * after returning from this function.
Temporarily dropping the rcu_read_lock protection like this is error prone. The caller should do the inc_not_zero() instead if it is needed (see the pseudo code below).
I feel the approach in patches 1 and 3 is a little boxed in by the earlier tcp-cc usage, which tried to fit into the kernel module reg/unreg paradigm and hide as many bpf details as possible from tcp-cc. This is not necessarily true now for other subsystems which have had bpf struct_ops from day one.
The epoll detach notification is link only. Can this kernel-side specific unreg be limited to the struct_ops link only? During reg, an rcu-protected link could be passed to the subsystem. That subsystem becomes a kernel user of the bpf link, and it can call link_detach(link) to detach. Pseudo code:
	struct bpf_link __rcu *link;
	struct bpf_link *ref_link;

	rcu_read_lock();
	ref_link = rcu_dereference(link);
	if (ref_link)
		ref_link = bpf_link_inc_not_zero(ref_link);
	rcu_read_unlock();
	if (!IS_ERR_OR_NULL(ref_link)) {
		bpf_struct_ops_map_link_detach(ref_link);
		bpf_link_put(ref_link);
	}
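Roughly, the subsystem side could then look like this. This is only a sketch: the extra link argument to reg() and the subsys_* names are made up for illustration, not an existing interface.

	/* hypothetical subsystem side */
	static struct bpf_link __rcu *subsys_link;

	static int subsys_reg(void *kdata, struct bpf_link *link)
	{
		/* remember the link so the detach snippet above can find it */
		rcu_assign_pointer(subsys_link, link);
		return 0;
	}

	static void subsys_unreg(void *kdata)
	{
		RCU_INIT_POINTER(subsys_link, NULL);
	}

The rcu_dereference()/bpf_link_inc_not_zero() snippet above would then run against subsys_link when the subsystem wants to tear the struct_ops down.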
+ *
+ * Return true if unreg() succeeds. If a call fails, it means some other
+ * task has unregistered or is unregistering the same object.
+ */
+bool bpf_struct_ops_kvalue_unreg(void *data)
+{
+	struct bpf_struct_ops_map *st_map =
+		container_of(data, struct bpf_struct_ops_map, kvalue.data);
+	enum bpf_struct_ops_state prev_state;
+	struct bpf_struct_ops_link *st_link;
+	bool ret = false;
+
+	/* The st_map and st_link should be protected by rcu_read_lock(),
+	 * or they may have been freed when we try to increase their
+	 * refcount.
+	 */
+	if (IS_ERR(bpf_map_inc_not_zero(&st_map->map)))
+		/* The map is already gone */
+		return false;
+
+	prev_state = cmpxchg(&st_map->kvalue.common.state,
+			     BPF_STRUCT_OPS_STATE_INUSE,
+			     BPF_STRUCT_OPS_STATE_TOBEFREE);
+	if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+		st_map->st_ops_desc->st_ops->unreg(data);
+		/* Pair with bpf_map_inc() for reg() */
+		bpf_map_put(&st_map->map);
+		/* Pair with bpf_map_inc_not_zero() above */
+		bpf_map_put(&st_map->map);
+		return true;
+	}
+	if (prev_state != BPF_STRUCT_OPS_STATE_READY)
+		goto fail;
+
+	/* With BPF_F_LINK */
+
+	st_link = rcu_dereference(st_map->attached);
+	if (!st_link || !bpf_link_inc_not_zero(&st_link->link))
+		/* The map is on the way to unregister */
+		goto fail;
+
+	rcu_read_unlock();
+	mutex_lock(&update_mutex);
+
+	if (rcu_dereference_protected(st_link->map, true) != &st_map->map)
+		/* The map should be unregistered already or on the way to
+		 * be unregistered.
+		 */
+		goto fail_unlock;
+
+	st_map->st_ops_desc->st_ops->unreg(data);
+
+	map_attached_null(st_map);
+	rcu_assign_pointer(st_link->map, NULL);
+	/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+	 * bpf_map_inc() in bpf_struct_ops_map_link_update().
+	 */
+	bpf_map_put(&st_map->map);
+
+	ret = true;
+
+fail_unlock:
+	mutex_unlock(&update_mutex);
+	rcu_read_lock();
+	bpf_link_put(&st_link->link);
+fail:
+	bpf_map_put(&st_map->map);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_kvalue_unreg);