On Sat, 2023-10-21 at 22:03 -0700, thinker.li@xxxxxxxxx wrote: > From: Kui-Feng Lee <thinker.li@xxxxxxxxx> > > To ensure that a module remains accessible whenever a struct_ops object of > a struct_ops type provided by the module is still in use. > > Signed-off-by: Kui-Feng Lee <thinker.li@xxxxxxxxx> > --- > include/linux/bpf.h | 1 + > include/linux/btf.h | 2 +- > kernel/bpf/bpf_struct_ops.c | 70 ++++++++++++++++++++++++++++++------- > 3 files changed, 60 insertions(+), 13 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 4f3b67932ded..26feb8a2da4f 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1626,6 +1626,7 @@ struct bpf_struct_ops { > void (*unreg)(void *kdata); > int (*update)(void *kdata, void *old_kdata); > int (*validate)(void *kdata); > + struct module *owner; > const char *name; > struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; > }; > diff --git a/include/linux/btf.h b/include/linux/btf.h > index 8e37f7eb02c7..6a64b372b7a0 100644 > --- a/include/linux/btf.h > +++ b/include/linux/btf.h > @@ -575,7 +575,7 @@ struct bpf_struct_ops; > struct bpf_struct_ops_desc; > > struct bpf_struct_ops_desc * > -btf_add_struct_ops(struct bpf_struct_ops *st_ops); > +btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops); > const struct bpf_struct_ops_desc * > btf_get_struct_ops(struct btf *btf, u32 *ret_cnt); > > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c > index 0bc21a39257d..413a3f8b26ba 100644 > --- a/kernel/bpf/bpf_struct_ops.c > +++ b/kernel/bpf/bpf_struct_ops.c > @@ -388,6 +388,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > const struct btf_member *member; > const struct btf_type *t = st_ops_desc->type; > struct bpf_tramp_links *tlinks; > + struct module *mod = NULL; > void *udata, *kdata; > int prog_fd, err; > void *image, *image_end; > @@ -425,6 +426,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > goto unlock; > } > > + if (st_ops_desc->btf != btf_vmlinux) { > + mod = btf_try_get_module(st_ops_desc->btf); > + if (!mod) { > + err = -EBUSY; Nit: there is a disagreement about error code returned for failing btf_try_get_module() across verifier code base: - EINVAL is used 2 times; - ENXIO is used 3 times; - ENOTSUPP is used once. Are you sure EBUSY is a good choice here? > + goto unlock; > + } > + } > + > memcpy(uvalue, value, map->value_size); > > udata = &uvalue->data; > @@ -552,6 +561,10 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. > */ > smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE); > + /* Hold the owner module until the struct_ops is > + * unregistered > + */ > + mod = NULL; > goto unlock; > } > > @@ -568,6 +581,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > memset(uvalue, 0, map->value_size); > memset(kvalue, 0, map->value_size); > unlock: > + module_put(mod); > kfree(tlinks); > mutex_unlock(&st_map->lock); > return err; > @@ -588,6 +602,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) > switch (prev_state) { > case BPF_STRUCT_OPS_STATE_INUSE: > st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); > + module_put(st_map->st_ops_desc->st_ops->owner); > bpf_map_put(map); > return 0; > case BPF_STRUCT_OPS_STATE_TOBEFREE: > @@ -674,6 +689,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > size_t st_map_size; > struct bpf_struct_ops_map *st_map; > const struct btf_type *t, *vt; > + struct module *mod = NULL; > struct bpf_map *map; > int ret; > > @@ -681,9 +697,17 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > if (!st_ops_desc) > return ERR_PTR(-ENOTSUPP); > > + if (st_ops_desc->btf != btf_vmlinux) { > + mod = btf_try_get_module(st_ops_desc->btf); > + if (!mod) > + return ERR_PTR(-EINVAL); > + } > + > vt = st_ops_desc->value_type; > - if (attr->value_size != vt->size) > - return ERR_PTR(-EINVAL); > + if (attr->value_size != vt->size) { > + ret = -EINVAL; > + goto errout; > + } > > t = st_ops_desc->type; > > @@ -694,17 +718,17 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > (vt->size - sizeof(struct bpf_struct_ops_value)); > > st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); > - if (!st_map) > - return ERR_PTR(-ENOMEM); > + if (!st_map) { > + ret = -ENOMEM; > + goto errout; > + } > > st_map->st_ops_desc = st_ops_desc; > map = &st_map->map; > > ret = bpf_jit_charge_modmem(PAGE_SIZE); > - if (ret) { > - __bpf_struct_ops_map_free(map); > - return ERR_PTR(ret); > - } > + if (ret) > + goto errout_free; > > st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); > if (!st_map->image) { > @@ -713,23 +737,32 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > * here. > */ > bpf_jit_uncharge_modmem(PAGE_SIZE); > - __bpf_struct_ops_map_free(map); > - return ERR_PTR(-ENOMEM); > + ret = -ENOMEM; > + goto errout_free; > } > st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); > st_map->links = > bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), > NUMA_NO_NODE); > if (!st_map->uvalue || !st_map->links) { > - __bpf_struct_ops_map_free(map); > - return ERR_PTR(-ENOMEM); > + ret = -ENOMEM; > + goto errout_free; > } > > mutex_init(&st_map->lock); > set_vm_flush_reset_perms(st_map->image); > bpf_map_init_from_attr(map, attr); > > + module_put(mod); > + > return map; > + > +errout_free: > + __bpf_struct_ops_map_free(map); > + btf = NULL; /* has been released */ > +errout: > + module_put(mod); > + return ERR_PTR(ret); > } > > static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) > @@ -811,6 +844,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) > * bpf_struct_ops_link_create() fails to register. > */ > st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); > + module_put(st_map->st_ops_desc->st_ops->owner); > bpf_map_put(&st_map->map); > } > kfree(st_link); > @@ -857,6 +891,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map > if (!bpf_struct_ops_valid_to_reg(new_map)) > return -EINVAL; > > + /* The old map is holding the refcount for the owner module. The > + * ownership of the owner module refcount is going to be > + * transferred from the old map to the new map. > + */ > if (!st_map->st_ops_desc->st_ops->update) > return -EOPNOTSUPP; > > @@ -902,6 +940,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) > struct bpf_link_primer link_primer; > struct bpf_struct_ops_map *st_map; > struct bpf_map *map; > + struct btf *btf; > int err; > > map = bpf_map_get(attr->link_create.map_fd); > @@ -926,8 +965,15 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) > if (err) > goto err_out; > > + /* Hold the owner module until the struct_ops is unregistered. */ > + btf = st_map->st_ops_desc->btf; > + if (btf != btf_vmlinux && !btf_try_get_module(btf)) { > + err = -EINVAL; > + goto err_out; > + } > err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); > if (err) { > + module_put(st_map->st_ops_desc->st_ops->owner); > bpf_link_cleanup(&link_primer); > link = NULL; > goto err_out;