Jesper Dangaard Brouer <brouer@xxxxxxxxxx> writes: > The devmap map-value can be read from BPF-prog side, and could be used for a > storage area per device. This could e.g. contain info on headers that need > to be added when packet egress this device. > > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > importantly the struct bpf_devmap_val is made dynamic via leveraging and > requiring BTF for struct sizes above 4. The only mandatory struct member is > 'ifindex' with a fixed offset of zero. > > Signed-off-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx> > --- > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 185 insertions(+), 31 deletions(-) > > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 4ab67b2d8159..9cf2dadcc0fe 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -48,6 +48,7 @@ > #include <net/xdp.h> > #include <linux/filter.h> > #include <trace/events/xdp.h> > +#include <linux/btf.h> > > #define DEV_CREATE_FLAG_MASK \ > (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { > unsigned int count; > }; > > -/* DEVMAP values */ > +/* DEVMAP map-value layout. > + * > + * The struct data-layout of map-value is a configuration interface. > + * BPF-prog side have read-only access to this memory. > + * > + * The layout might be different than below, because some struct members are > + * optional. This is made dynamic by requiring userspace provides an BTF > + * description of the struct layout, when creating the BPF-map. Struct names > + * are important and part of API, as BTF use these names to identify members. > + */ > struct bpf_devmap_val { > - __u32 ifindex; /* device index */ > + __u32 ifindex; /* device index - mandatory */ > union { > int fd; /* prog fd on map write */ > __u32 id; /* prog id on map read */ > } bpf_prog; > + struct { > + /* This 'storage' member is meant as a dynamically sized area, > + * that BPF developer can redefine. As other members are added > + * overtime, this area can shrink, as size can be regained by > + * not using members above. Add new members above this struct. > + */ > + unsigned char data[24]; > + } storage; Why is this needed? Userspace already passes in the value_size, so why can't the kernel just use the BTF to pick out the values it cares about and let the rest be up to userspace? > }; > > struct bpf_dtab_netdev { > @@ -79,10 +97,18 @@ struct bpf_dtab_netdev { > struct bpf_devmap_val val; > }; > > +struct bpf_devmap_val_cfg { > + struct { > + int ifindex; > + int bpf_prog; > + } btf_offset; > +}; > + > struct bpf_dtab { > struct bpf_map map; > struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ > struct list_head list; > + struct bpf_devmap_val_cfg cfg; > > /* these are only used for DEVMAP_HASH type maps */ > struct hlist_head *dev_index_head; > @@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, > > static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) > { > - __u32 valsize = attr->value_size; > u64 cost = 0; > int err; > > - /* check sanity of attributes. 2 value sizes supported: > - * 4 bytes: ifindex > - * 8 bytes: ifindex + prog fd > - */ > + /* Value contents validated in dev_map_check_btf */ > if (attr->max_entries == 0 || attr->key_size != 4 || > - (valsize != offsetofend(struct bpf_devmap_val, ifindex) && > - valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || > + attr->value_size > sizeof(struct bpf_devmap_val) || > attr->map_flags & ~DEV_CREATE_FLAG_MASK) > return -EINVAL; > > + /* Enforce BTF for userspace, unless dealing with legacy kABI */ > + if (attr->value_size != 4 && > + (!attr->btf_key_type_id || !attr->btf_value_type_id)) > + return -EOPNOTSUPP; > + > + /* Mark BTF offset's as invalid */ > + dtab->cfg.btf_offset.ifindex = -1; > + dtab->cfg.btf_offset.bpf_prog = -1; > + > /* Lookup returns a pointer straight to dev->ifindex, so make sure the > * verifier prevents writes from the BPF side > */ > @@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > return &dtab->map; > } > > +struct expect { > + u8 btf_kind; > + bool mandatory; > + int bit_offset; > + int size; > + const char *name; > +}; > + > +static int btf_find_expect_layout_offset(const struct btf *btf, > + const struct btf_type *value_type, > + const struct expect *layout) > +{ > + const struct btf_member *member; > + u32 i, off = -ENOENT; > + > + for_each_member(i, value_type, member) { > + const struct btf_type *member_type; > + const char *member_name; > + > + member_type = btf_type_skip_modifiers(btf, member->type, NULL); > + if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) { > + continue; > + } > + > + member_name = btf_name_by_offset(btf, member->name_off); > + if (!member_name) > + return -EINVAL; > + > + if (strcmp(layout->name, member_name)) > + continue; > + > + if (layout->size > 0 && // btf_type_has_size(member_type) && > + member_type->size != layout->size) > + continue; > + > + off = btf_member_bit_offset(value_type, member); > + if (layout->bit_offset > 0 && > + layout->bit_offset != off) { > + off = -ENOENT; > + continue; > + } Won't this enforced offset cause problems for extensibility? Say we introduce a third struct member that the kernel understands, e.g. another u32 with expect->bit_offset=64. That would mean you can no longer skip members, since that would make any subsequent offset tests fail? Or am I misunderstanding how this is supposed to work? > + > + return off; > + } > + return off; > +} > + > +/* Expected BTF layout that match struct bpf_devmap_val */ > +static const struct expect layout[] = { > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > +}; > + > +static int dev_map_check_btf(const struct bpf_map *map, > + const struct btf *btf, > + const struct btf_type *key_type, > + const struct btf_type *value_type) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + u32 found_members_cnt = 0; > + u32 int_data; > + int off; > + u32 i; > + > + /* Validate KEY type and size */ > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > + return -EOPNOTSUPP; > + > + int_data = *(u32 *)(key_type + 1); > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > + return -EOPNOTSUPP; > + > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > + * - With a flexible size of member 'storage'. > + */ > + > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > + return -EOPNOTSUPP; > + > + /* Struct/union members in BTF must not exceed (max) expected members */ > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > + return -E2BIG; > + > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > + > + if (off < 0 && layout[i].mandatory) > + return -EUCLEAN; > + > + if (off >= 0) > + found_members_cnt++; > + > + /* Transfer layout config to map */ > + switch (i) { > + case 0: > + dtab->cfg.btf_offset.ifindex = off; > + break; > + case 1: > + dtab->cfg.btf_offset.bpf_prog = off; > + break; > + default: > + break; > + } > + } > + > + /* Detect if BTF/vlen have members that were not found */ > + if (btf_type_vlen(value_type) > found_members_cnt) > + return -E2BIG; > + > + return 0; > +} > + > static void dev_map_free(struct bpf_map *map) > { > struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > @@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) > return ret; > } > > +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab) > +{ > + return dtab->cfg.btf_offset.bpf_prog >= 0; > +} > + > static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, > - struct bpf_dtab *dtab, > + struct bpf_map *map, > struct bpf_devmap_val *val, > unsigned int idx) > { > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > struct bpf_prog *prog = NULL; > struct bpf_dtab_netdev *dev; > > - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, > + dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, > dtab->map.numa_node); > if (!dev) > return ERR_PTR(-ENOMEM); > > + /* Member: ifindex is mandatory, both BTF and kABI */ > dev->dev = dev_get_by_index(net, val->ifindex); > if (!dev->dev) > goto err_out; > > - if (val->bpf_prog.fd >= 0) { > - prog = bpf_prog_get_type_dev(val->bpf_prog.fd, > - BPF_PROG_TYPE_XDP, false); > - if (IS_ERR(prog)) > - goto err_put_dev; > - if (prog->expected_attach_type != BPF_XDP_DEVMAP) > - goto err_put_prog; > + /* Member: bpf_prog union is optional, but have fixed offset if exist */ > + if (map_value_has_bpf_prog(dtab)) { > + if (val->bpf_prog.fd >= 0) { > + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, > + BPF_PROG_TYPE_XDP, false); > + if (IS_ERR(prog)) > + goto err_put_dev; > + if (prog->expected_attach_type != BPF_XDP_DEVMAP) > + goto err_put_prog; > + } > + if (prog) { > + dev->xdp_prog = prog; > + val->bpf_prog.id = prog->aux->id; > + } else { > + dev->xdp_prog = NULL; > + val->bpf_prog.id = 0; > + } > } > - > dev->idx = idx; > dev->dtab = dtab; > - if (prog) { > - dev->xdp_prog = prog; > - dev->val.bpf_prog.id = prog->aux->id; > - } else { > - dev->xdp_prog = NULL; > - dev->val.bpf_prog.id = 0; > - } > - dev->val.ifindex = val->ifindex; > + > + /* After adjustment copy map value to get storage area */ > + memcpy(&dev->val, val, map->value_size); > > return dev; > err_put_prog: > @@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, > if (val.bpf_prog.fd != -1) > return -EINVAL; > } else { > - dev = __dev_map_alloc_node(net, dtab, &val, i); > + dev = __dev_map_alloc_node(net, map, &val, i); > if (IS_ERR(dev)) > return PTR_ERR(dev); > } > @@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, > if (old_dev && (map_flags & BPF_NOEXIST)) > goto out_err; > > - dev = __dev_map_alloc_node(net, dtab, &val, idx); > + dev = __dev_map_alloc_node(net, map, &val, idx); > if (IS_ERR(dev)) { > err = PTR_ERR(dev); > goto out_err; > @@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = { > .map_lookup_elem = dev_map_lookup_elem, > .map_update_elem = dev_map_update_elem, > .map_delete_elem = dev_map_delete_elem, > - .map_check_btf = map_check_no_btf, > + .map_check_btf = dev_map_check_btf, > }; > > const struct bpf_map_ops dev_map_hash_ops = { > @@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = { > .map_lookup_elem = dev_map_hash_lookup_elem, > .map_update_elem = dev_map_hash_update_elem, > .map_delete_elem = dev_map_hash_delete_elem, > - .map_check_btf = map_check_no_btf, > + .map_check_btf = dev_map_check_btf, > }; > > static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,