On Mon, Dec 23, 2019 at 11:57:48AM -0800, Yonghong Song wrote:
[ ... ]

> > +static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
> > +                                          void *value, u64 flags)
> > +{
> > +        struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
> > +        const struct bpf_struct_ops *st_ops = st_map->st_ops;
> > +        struct bpf_struct_ops_value *uvalue, *kvalue;
> > +        const struct btf_member *member;
> > +        const struct btf_type *t = st_ops->type;
> > +        void *udata, *kdata;
> > +        int prog_fd, err = 0;
> > +        void *image;
> > +        u32 i;
> > +
> > +        if (flags)
> > +                return -EINVAL;
> > +
> > +        if (*(u32 *)key != 0)
> > +                return -E2BIG;
> > +
> > +        uvalue = (struct bpf_struct_ops_value *)value;
> > +        if (uvalue->state || refcount_read(&uvalue->refcnt))
> > +                return -EINVAL;
> > +
> > +        uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
> > +        kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
> > +
> > +        spin_lock(&st_map->lock);
> > +
> > +        if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
> > +                err = -EBUSY;
> > +                goto unlock;
> > +        }
> > +
> > +        memcpy(uvalue, value, map->value_size);
> > +
> > +        udata = &uvalue->data;
> > +        kdata = &kvalue->data;
> > +        image = st_map->image;
> > +
> > +        for_each_member(i, t, member) {
> > +                const struct btf_type *mtype, *ptype;
> > +                struct bpf_prog *prog;
> > +                u32 moff;
> > +
> > +                moff = btf_member_bit_offset(t, member) / 8;
> > +                mtype = btf_type_by_id(btf_vmlinux, member->type);
> > +                ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
> > +                if (ptype == module_type) {
> > +                        *(void **)(kdata + moff) = BPF_MODULE_OWNER;
> > +                        continue;
> > +                }
> > +
> > +                err = st_ops->init_member(t, member, kdata, udata);
> > +                if (err < 0)
> > +                        goto reset_unlock;
> > +
> > +                /* The ->init_member() has handled this member */
> > +                if (err > 0)
> > +                        continue;
> > +
> > +                /* If st_ops->init_member does not handle it,
> > +                 * we will only handle func ptrs and zero-ed members
> > +                 * here. Reject everything else.
> > +                 */
> > +
> > +                /* All non func ptr member must be 0 */
> > +                if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type,
> > +                                               NULL)) {
> > +                        u32 msize;
> > +
> > +                        mtype = btf_resolve_size(btf_vmlinux, mtype,
> > +                                                 &msize, NULL, NULL);
> > +                        if (IS_ERR(mtype)) {
> > +                                err = PTR_ERR(mtype);
> > +                                goto reset_unlock;
> > +                        }
> > +
> > +                        if (memchr_inv(udata + moff, 0, msize)) {
> > +                                err = -EINVAL;
> > +                                goto reset_unlock;
> > +                        }
> > +
> > +                        continue;
> > +                }
> > +
> > +                prog_fd = (int)(*(unsigned long *)(udata + moff));
> > +                /* Similar check as the attr->attach_prog_fd */
> > +                if (!prog_fd)
> > +                        continue;
> > +
> > +                prog = bpf_prog_get(prog_fd);
> > +                if (IS_ERR(prog)) {
> > +                        err = PTR_ERR(prog);
> > +                        goto reset_unlock;
> > +                }
> > +                st_map->progs[i] = prog;
> > +
> > +                if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
> > +                    prog->aux->attach_btf_id != st_ops->type_id ||
> > +                    prog->expected_attach_type != i) {
> > +                        err = -EINVAL;
> > +                        goto reset_unlock;
> > +                }
> > +
> > +                err = arch_prepare_bpf_trampoline(image,
> > +                                                  &st_ops->func_models[i], 0,
> > +                                                  &prog, 1, NULL, 0, NULL);
> > +                if (err < 0)
> > +                        goto reset_unlock;
> > +
> > +                *(void **)(kdata + moff) = image;
> > +                image += err;
> > +
> > +                /* put prog_id to udata */
> > +                *(unsigned long *)(udata + moff) = prog->aux->id;
> > +        }
>
> Should we check whether the user indeed provided the `module` member
> before declaring success?
hmm.... By 'module', you mean the "if (ptype == module_type)" logic at
the beginning?

Yes, it should also check that the user passed 0 there. Will fix.
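Something like this in the module_type branch above (a rough, untested
sketch; the respin may end up different):

                if (ptype == module_type) {
                        /* The kernel owns the module member, so reject
                         * any non-zero value coming from userspace.
                         */
                        if (*(void **)(udata + moff)) {
                                err = -EINVAL;
                                goto reset_unlock;
                        }
                        *(void **)(kdata + moff) = BPF_MODULE_OWNER;
                        continue;
                }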
> > +static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
> > +{
> > +        const struct bpf_struct_ops *st_ops;
> > +        size_t map_total_size, st_map_size;
> > +        struct bpf_struct_ops_map *st_map;
> > +        const struct btf_type *t, *vt;
> > +        struct bpf_map_memory mem;
> > +        struct bpf_map *map;
> > +        int err;
> > +
> > +        if (!capable(CAP_SYS_ADMIN))
> > +                return ERR_PTR(-EPERM);
> > +
> > +        st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
> > +        if (!st_ops)
> > +                return ERR_PTR(-ENOTSUPP);
> > +
> > +        vt = st_ops->value_type;
> > +        if (attr->value_size != vt->size)
> > +                return ERR_PTR(-EINVAL);
> > +
> > +        t = st_ops->type;
> > +
> > +        st_map_size = sizeof(*st_map) +
> > +                /* kvalue stores the
> > +                 * struct bpf_struct_ops_tcp_congestions_ops
> > +                 */
> > +                (vt->size - sizeof(struct bpf_struct_ops_value));
> > +        map_total_size = st_map_size +
> > +                /* uvalue */
> > +                sizeof(vt->size) +
> > +                /* struct bpf_progs **progs */
> > +                btf_type_vlen(t) * sizeof(struct bpf_prog *);
> > +        err = bpf_map_charge_init(&mem, map_total_size);
> > +        if (err < 0)
> > +                return ERR_PTR(err);
> > +
> > +        st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
> > +        if (!st_map) {
> > +                bpf_map_charge_finish(&mem);
> > +                return ERR_PTR(-ENOMEM);
> > +        }
> > +        st_map->st_ops = st_ops;
> > +        map = &st_map->map;
> > +
> > +        st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
> > +        st_map->progs =
> > +                bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
> > +                                   NUMA_NO_NODE);
> > +        /* Each trampoline costs < 64 bytes. Ensure one page
> > +         * is enough for max number of func ptrs.
> > +         */
> > +        BUILD_BUG_ON(PAGE_SIZE / 64 < BPF_STRUCT_OPS_MAX_NR_MEMBERS);
>
> This may be true for x86 now, but it may not hold up for other future
> architectures. Not sure whether we should get the value from arch
> callbacks, or just bail out during map update if it ever grows to
> exceed one page.
I will reuse the existing WARN_ON_ONCE() check in
arch_prepare_bpf_trampoline().  Need to parameterize the end-of-image
(i.e. the current PAGE_SIZE / 2 assumption).
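e.g. pass the image boundary down to the arch code instead of having it
assume PAGE_SIZE / 2, along these lines (an untested sketch; the
image_end parameter name is illustrative):

int arch_prepare_bpf_trampoline(void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_prog **fentry_progs, int fentry_cnt,
                                struct bpf_prog **fexit_progs, int fexit_cnt,
                                void *orig_call);

The x86 implementation would then replace the
WARN_ON_ONCE(prog_size > PAGE_SIZE / 2) with a bound check against
image_end and return -EFAULT when the generated code would not fit, and
bpf_struct_ops_map_update_elem() could pass st_map->image + PAGE_SIZE
and bail out cleanly on error.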